def from_options(cls, dim, attn_type="dot", attn_func="softmax"): str2score = {"dot": DotScorer(), "general": GeneralScorer(dim)} str2func = { "softmax": nn.Softmax(dim=-1), "sparsemax": Sparsemax(dim=-1), "fusedmax": Fusedmax(), "oscarmax": Oscarmax() } score = str2score[attn_type] transform = str2func[attn_func] return cls(score, transform)
def from_options(cls, dim, attn_type="dot", attn_func="softmax", gate_func="softmax"): lemma_attn = AttentionHead.from_options( dim, attn_type=attn_type, attn_func=attn_func) inflection_attn = AttentionHead.from_options( dim, attn_type=attn_type, attn_func=attn_func) attn_output_layer = nn.Sequential( nn.Linear(dim * 2, dim, bias=False), nn.Tanh() ) str2func = { "softmax": nn.Softmax(dim=-1), "sparsemax": Sparsemax(dim=-1) } gate_transform = str2func[gate_func] # try it with bias? gate = nn.Sequential(nn.Linear(dim * 3, 2, bias=True), gate_transform) return cls(lemma_attn, inflection_attn, attn_output_layer, gate)
def from_options(cls, dim, attn_type="dot", attn_func="softmax", gate_func="softmax", combine_gate_input=False, n_global_heads=1, infl_attn_func=None): lemma_attn = AttentionHead.from_options(dim, attn_type=attn_type, attn_func=attn_func) if infl_attn_func == None: infl_attn_func = attn_func inflection_attn = AttentionHead.from_options(dim, attn_type=attn_type, attn_func=infl_attn_func) lemma_out = nn.Sequential(nn.Linear(dim * 2, dim, bias=False), nn.Tanh()) infl_out = nn.Sequential(nn.Linear(dim * 2, dim, bias=False), nn.Tanh()) str2func = { "softmax": nn.Softmax(dim=-1), "sparsemax": Sparsemax(dim=-1) } gate_transform = str2func[gate_func] # try it with bias? if combine_gate_input: # input is global head (1 or more), two local heads and query (decoder state) gate = nn.Sequential( nn.Linear(dim * (n_global_heads + 3), 2, bias=True), gate_transform) else: # input is global head (1 or more) and query (decoder state) gate = nn.Sequential( nn.Linear(dim * (n_global_heads + 1), 2, bias=True), gate_transform) return cls(lemma_attn, inflection_attn, lemma_out, infl_out, gate, combine_gate_input)
def __init__(
    self,
    head_count,
    model_dim,
    dropout=0.1,
    attn_func="softmax",
    attn_alpha=None,
    attn_bisect_iter=0,
):
    assert model_dim % head_count == 0
    self.dim_per_head = model_dim // head_count
    self.model_dim = model_dim

    super(MultiHeadedAttention, self).__init__()
    self.head_count = head_count

    # Projections for keys, values, and queries (one slice per head).
    self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
    self.linear_values = nn.Linear(model_dim, head_count * self.dim_per_head)
    self.linear_query = nn.Linear(model_dim, head_count * self.dim_per_head)

    # Normalization applied to the attention scores.
    if attn_func == "softmax":
        self.normalization = nn.Softmax(dim=-1)
    elif attn_func == "esoftmax":
        self.normalization = ESoftmax(dim=-1)
    elif attn_func == "sparsemax":
        self.normalization = Sparsemax(dim=-1)
    elif attn_func == "tsallis15":
        self.normalization = Tsallis15(dim=-1)
    elif attn_func == "tsallis":
        self.normalization = TsallisBisect(
            alpha=attn_alpha, n_iter=attn_bisect_iter
        )
    else:
        raise ValueError(f"Unsupported attention function: {attn_func}")

    self.dropout = nn.Dropout(dropout)
    self.final_linear = nn.Linear(model_dim, model_dim)
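# Usage sketch (illustrative, not from the original source): instantiating the
# module with sparsemax attention, and with the bisection-based Tsallis variant,
# which needs an explicit alpha and iteration count. The forward-pass signature
# is not shown above and is not assumed here.
#
#     attn = MultiHeadedAttention(head_count=8, model_dim=512, dropout=0.1,
#                                 attn_func="sparsemax")
#     attn_bisect = MultiHeadedAttention(head_count=8, model_dim=512,
#                                        attn_func="tsallis", attn_alpha=1.5,
#                                        attn_bisect_iter=25)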
import pytest
import torch

from onmt.modules.sparse_activations import (
    Sparsemax,
    Tsallis15,
    SparsemaxTopK,
    Tsallis15TopK,
)
from onmt.modules.root_finding import (
    sparsemax_bisect,
    tsallis_bisect,
)

funcs = [
    Sparsemax(dim=1),
    Tsallis15(dim=1),
    SparsemaxTopK(dim=1),
    Tsallis15TopK(dim=1),
    sparsemax_bisect,
    tsallis_bisect,
]


@pytest.mark.parametrize('func', funcs)
@pytest.mark.parametrize('dtype', (torch.float32, torch.float64))
def test_mask(func, dtype):
    torch.manual_seed(42)
    x = torch.randn(2, 6, dtype=dtype)
    x[:, 3:] = -float('inf')  # mask out the last three columns
    x0 = x[:, :3]             # the unmasked slice, kept for comparison