def __init__(self, d_model):
    super().__init__()
    self.d_model = d_model
    self.num_relations = 40
    self.fc_dir_weight = clones(nn.Linear(d_model, d_model, bias=False), 3)
    self.fc_dir_bias = [
        nn.Parameter(torch.zeros(d_model))
        for _ in range(self.num_relations * 2 - 1)
    ]
    self.fc_dir_bias1 = nn.ParameterList(self.fc_dir_bias[-1:])
    self.fc_dir_bias2 = nn.ParameterList(
        self.fc_dir_bias[:self.num_relations - 1])
    self.fc_dir_bias3 = nn.ParameterList(
        self.fc_dir_bias[self.num_relations - 1:-1])
    self.fc_gate_weight = clones(nn.Linear(d_model, d_model, bias=False), 3)
    self.fc_gate_bias = [
        nn.Parameter(torch.zeros(d_model))
        for _ in range(self.num_relations * 2 - 1)
    ]
    self.fc_gate_bias1 = nn.ParameterList(self.fc_gate_bias[-1:])
    self.fc_gate_bias2 = nn.ParameterList(
        self.fc_gate_bias[:self.num_relations - 1])
    self.fc_gate_bias3 = nn.ParameterList(
        self.fc_gate_bias[self.num_relations - 1:-1])
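# Every snippet below relies on a `clones` helper (sometimes `utils.clones`)
# that is not shown here. A minimal sketch consistent with its usage,
# assuming the standard deep-copy-into-ModuleList helper from the Annotated
# Transformer:
import copy
import torch.nn as nn

def clones(module, N):
    "Produce N identical layers as independent (deep-copied) modules."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])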
def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    # self.self_attn is the Multi-Head Attention layer
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayerconnections = clones(SublayerConnection(size, dropout), 2)
    self.size = size
def __init__(self, dm, dropout=0.1):
    super(EncoderBlock, self).__init__()
    self.pe = PositionalEncoding(dm, dropout)
    self.self_attn = Attn()
    self.ffn = PositionWiseFFN(dm, dm // 2)
    self.dropout = dropout
    self.highways = utils.clones(HighWay(dm, dropout), 2)
def __init__(self, size, dropout, self_attn, feed_forward):
    super(EncoderLayer, self).__init__()
    self.size = size
    self.dropout = dropout
    self.self_attn = self_attn
    self.sublayers = clones(SubLayer(size, dropout), 2)
    self.feed_forward = feed_forward
def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = utils.clones(SublayerConnection(size, dropout), 2)
    self.size = size
    self.local_rnn = LocalRNNLayer(size, dropout)
def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 3)
def __init__(self, head, d_embedding, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    assert d_embedding % head == 0
    self.d_k = d_embedding // head
    self.head = head
    self.linears = clones(nn.Linear(d_embedding, d_embedding), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, N, d_model, h, dropout, bidirectional, mix=False):
    super(Transformer, self).__init__()
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, dropout=dropout)
    self.bidirectional = bidirectional
    self.model = Encoder(EncoderLayer(d_model, attn, ff, dropout), N, mix)
    if self.bidirectional:
        self.model = clones(self.model, 2)
def __init__(self, num_attention_layers, dim_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    self.h = num_attention_layers
    self.dropout = dropout
    self.d_model = dim_model
    self.d_k = dim_model // self.h
    self.linears = clones(nn.Linear(dim_model, dim_model), 4)
    self.attn = None
    self.drop = nn.Dropout(self.dropout)
def __init__(self, h, d_model, dropout=0.1):
    # d_model: embedding dimension; h: number of heads
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    self.d_k = d_model // h
    self.h = h
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, model_dim, head_count, dropout):
    super(MultiHeadAttn, self).__init__()
    self.model_dim = model_dim
    assert model_dim % head_count == 0
    self.dim_per_head = model_dim // head_count
    self.head_count = head_count
    self.linear_layers = clones(nn.Linear(model_dim, model_dim), 4)
    self.dropout = nn.Dropout(dropout)
def __init__(self, decoder_layer, num_layers):
    """Initializer.

    Args:
        decoder_layer: (DecoderLayer).
        num_layers: (int) number of decoder layers in stack.
    """
    super(Decoder, self).__init__()
    self.decoder_layers = utils.clones(decoder_layer, num_layers)
    self.layer_norm = LayerNorm(decoder_layer.model_size)
def __init__(self, size, dropout, self_attn, src_attn, feed_forward, d_model, vocab):
    super(DecoderLayer, self).__init__()
    self.size = size
    self.dropout = nn.Dropout(dropout)
    self.attn = self_attn
    self.src_attn = src_attn
    self.sub_layers = clones(SubLayer(size, dropout), 3)
    self.feed_forward = feed_forward
    self.generator = Generator(d_model=d_model, vocab=vocab)
def __init__(self, h, d_model, dropout=0.1): "Take in model size and number of heads." super(MultiHeadedAttention, self).__init__() assert d_model % h == 0 # We assume d_v always equals d_k self.d_k = d_model // h self.h = h self.linears = clones(linear.Linear(d_model, d_model), 4) self.attn = None self.dropout = nn.Dropout(p=dropout)
def __init__(self, h: int, d_model: int, dropout=0.1):
    """Take in model size and number of heads."""
    super(MultiHeadAttention, self).__init__()
    assert d_model % h == 0
    self.d_k = d_model // h  # e.g. h=8, d_model=512 -> d_k=64
    self.h = h
    # 4 linear modules mapping d_model to d_model
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None  # attention weights computed in the forward pass
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, size: int, self_attn: MultiHeadAttention,
             feed_forward: PositionwiseFeedForward, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.size = size
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    # 2 sub-layers: 1 self-attention + 1 feed-forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)
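# Several of these layers wrap their sub-blocks in a `SublayerConnection`
# (or `Sublayer`/`SubLayer`) module that is not shown here. A minimal sketch
# consistent with how it is called, assuming the pre-norm residual pattern
# from the Annotated Transformer:
import torch.nn as nn

class SublayerConnection(nn.Module):
    "Residual connection around a normalized, dropout-regularized sub-layer."
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # `sublayer` is a callable, e.g. `lambda x: self_attn(x, x, x, mask)`
        return x + self.dropout(sublayer(self.norm(x)))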
def __init__(self, d_model, k, num_heads, num_features, dropout=0):
    super(TreeRelativePosition, self).__init__()
    self.d_model = d_model
    self.k = k
    self.num_features = num_features
    self.num_heads = num_heads
    self.dropout = nn.Dropout(dropout)
    self.emb_list = clones(nn.Embedding(2 * k + 2, d_model * 2), num_features)
def __init__(self, h, d_model, dropout=0.1): "Take in model size and number of heads." super(MultiHeadedAttention, self).__init__() assert d_model % h == 0 # We assume d_v always equals d_k self.d_k = d_model // h # The output dim of each head self.h = h # Number of heads # The former 3 linear modules are the combined {W_i}^Q, {W_i}^K, {W_i}^V self.linears = clones(nn.Linear(d_model, d_model), 4) self.attn = None self.dropout = nn.Dropout(p=dropout)
def __init__(self, d_model=512, h=8, d_ff=2048, dropout_rate=0.1):
    super(DecoderLayer, self).__init__()
    self.self_attn = MultiHeadedAttention(h, d_model, dropout_rate=dropout_rate)
    self.src_attn = MultiHeadedAttention(h, d_model, dropout_rate=dropout_rate)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_rate=dropout_rate)
    self.sublayers = clones(SublayerConnection(d_model, dropout_rate), 3)
def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
    super(DecoderLayer, self).__init__()
    # size = d_embedding = 512
    # self_attn: a MultiHeadAttention object over tgt_vocab (self-attention)
    # src_attn: a second MultiHeadAttention object between tgt_vocab and src_vocab
    # feed_forward: the final fully-connected (position-wise feed-forward) layer
    # dropout = 0.1
    self.size = size
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    # Three SublayerConnection objects, one each for
    # self.self_attn, self.src_attn, and self.feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 3)
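# A minimal sketch of how a DecoderLayer forward typically chains the three
# SublayerConnection objects above, assuming the Annotated Transformer
# convention (`memory` is the encoder output; `src_mask`/`tgt_mask` are the
# padding and subsequent masks):
def forward(self, x, memory, src_mask, tgt_mask):
    m = memory
    # 1) masked self-attention over the target sequence
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    # 2) encoder-decoder ("source") attention against the encoder memory
    x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
    # 3) position-wise feed-forward
    return self.sublayer[2](x, self.feed_forward)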
def __init__(self, d_model, h, dropout=0.1):
    """Multi-head attention.

    :param h: nhead
    :param d_model: d_model
    :param dropout: float
    """
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    # assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    self.linears = utils.clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()
    assert d_model % h == 0
    # self.d_k is the reduced dimension of each parallel attention head
    self.d_k = d_model // h
    self.h = h
    # self.linears is a list of 4 projection layers:
    # self.linears[0]: Concat(W^Q_i), where i \in [1,...,h].
    # self.linears[1]: Concat(W^K_i), where i \in [1,...,h].
    # self.linears[2]: Concat(W^V_i), where i \in [1,...,h].
    # self.linears[3]: the output projection.
    self.linears = clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, config):
    super(TOI_BERT, self).__init__()
    self.config = config
    self.outfile = None
    self.input_size_bert = config.input_size_bert
    self.input_size = self.input_size_bert if config.use_bert else 0
    if self.config.if_DTE:
        self.dte = DTE(config)
        self.input_size += self.dte.input_size
    if self.config.use_cnn:
        self.res_nets = clones(
            TOI_CNN_RES(self.input_size, self.input_size,
                        kernal_size=self.config.kernel_size),
            self.config.cnn_block)
    else:
        self.project = nn.Sequential(
            nn.Linear(self.input_size, self.input_size),
            nn.Dropout(0.5),
            nn.ReLU()
        )
    self.hat_1 = TOI_Pooling(self.input_size, self.config.if_gpu,
                             self.config.hit_pooling_size)
    self.pooling_size = 2 + self.config.hit_pooling_size
    self.one_step_to_share = nn.Sequential(
        nn.Linear(self.input_size * self.pooling_size,
                  self.config.nested_depth_fc_size),
        nn.Dropout(0.5),
        nn.ReLU(),
    )
    self.one_step_to_heaven = nn.Sequential(
        nn.Linear(self.config.nested_depth_fc_size, self.config.label_kinds),
    )
    self.one_step_to_hell = nn.Sequential(
        nn.Linear(self.config.nested_depth_fc_size, self.config.nested_depth),
    )
    if self.config.fusion:
        self.fusion = nn.Sequential(nn.Softmax(dim=0))
        self.fusion_parameters = torch.nn.Parameter(torch.ones(config.fusion_layer, 1))
        self.fusion_gamma = torch.nn.Parameter(torch.ones(1))
    self.cls_ce_loss = nn.CrossEntropyLoss()
def __init__(self, h, d_k, d_model, p_drop):
    '''In the paper: h = 8, d_k = d_v = 64, d_model = 512, p_drop = 0.1.

    Assume d_k = d_v.
    Check whether d_model is a multiple of h.
    '''
    super(MultiHeadAttention, self).__init__()
    assert d_model % h == 0
    self.h = h
    self.d_k = d_k
    self.d_model = d_model
    self.p_drop = p_drop
    self.linear_layers = clones(
        nn.Linear(in_features=d_model, out_features=d_model, bias=False), 4)
    self.attention = None
    self.layernorm = nn.LayerNorm(normalized_shape=d_model)
    self.drop = nn.Dropout(p=p_drop)
def __init__(self, model_size, multi_headed_attention, feed_forward, dropout_rate):
    """Initializer.

    Args:
        model_size: (int) model input feature size.
        multi_headed_attention: (MultiHeadedAttention).
        feed_forward: (PositionwiseFeedForward).
        dropout_rate: (float) dropout rate.
    """
    super(EncoderLayer, self).__init__()
    self.self_attention = multi_headed_attention
    self.feed_forward = feed_forward
    # Two sublayers: one applied after self-attention, the other
    # applied after the position-wise feed-forward.
    self.sublayer = utils.clones(Sublayer(model_size, dropout_rate), 2)
    self.model_size = model_size
def __init__(self, num_heads, model_size, dropout_rate=0.1):
    """Initializer.

    Args:
        num_heads: (int) number of attention heads.
        model_size: (int) model input feature size.
        dropout_rate: (float) dropout rate.
    """
    super(MultiHeadedAttention, self).__init__()
    assert model_size % num_heads == 0
    self.head_size = model_size // num_heads
    self.num_heads = num_heads
    # Linear projections for query, key, value, and the output
    # of the multi-headed attention layer: 4 in total.
    self.linears = utils.clones(nn.Linear(model_size, model_size), 4)
    self.attention = None
    self.dropout = nn.Dropout(p=dropout_rate)
def __init__(self, d_model, h, max_relative_position, dropout=.0):
    """Multi-head attention with relative position representations (RPR).

    :param d_model: d_model
    :param h: nhead
    :param max_relative_position: clipping distance for relative positions
    :param dropout: float
    """
    super(MultiHeadedAttention_RPR, self).__init__()
    assert d_model % h == 0
    # assume d_v always equals d_k
    self.d_k = d_model // h
    self.h = h
    self.linears = utils.clones(nn.Linear(d_model, d_model), 4)
    self.dropout = nn.Dropout(p=dropout)
    self.max_relative_position = max_relative_position
    self.vocab_size = max_relative_position * 2 + 1
    self.embed_K = nn.Embedding(self.vocab_size, self.d_k)
    self.embed_V = nn.Embedding(self.vocab_size, self.d_k)
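# A minimal sketch of how the two relative-position embedding tables above
# are commonly indexed, assuming pairwise distances clipped to
# [-max_relative_position, max_relative_position] and shifted to be
# non-negative (as in Shaw et al., 2018); the helper name is illustrative:
import torch

def relative_position_index(length, max_relative_position):
    "Return a (length, length) LongTensor of clipped, shifted distances."
    positions = torch.arange(length)
    distance = positions[None, :] - positions[:, None]  # j - i
    distance = distance.clamp(-max_relative_position, max_relative_position)
    return distance + max_relative_position  # values in [0, 2 * max_relative_position]

# Usage sketch:
#   idx = relative_position_index(seq_len, self.max_relative_position)
#   rel_k = self.embed_K(idx)  # (seq_len, seq_len, d_k), added into the attention scores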
def __init__(self, h, d_model, dropout=0.1): """ :param h: num of heads(parallel attention layers) :param d_model: dimension of input(embedding) :parameter linears: Wq_i [d_model, d_k] * h Wk_i [d_model, d_k] * h Wv_i [d_model, d_v] * h Wo [d_v * h, d_model] """ super(MultiHeadedAttention, self).__init__() assert d_model % h == 0 # We assume d_v always equals d_k self.d_k = d_model // h self.h = h self.linears = clones(nn.Linear(d_model, d_model), 4) self.attn = None self.dropout = nn.Dropout(p=dropout)
def __init__(self, model_size, multi_headed_attention_1, multi_headed_attention_2,
             feed_forward, dropout_rate):
    """Initializer.

    Args:
        model_size: (int) model input feature size.
        multi_headed_attention_1: (MultiHeadedAttention).
        multi_headed_attention_2: (MultiHeadedAttention).
        feed_forward: (PositionwiseFeedForward).
        dropout_rate: (float) dropout rate.
    """
    super(DecoderLayer, self).__init__()
    self.self_attention = multi_headed_attention_1
    self.source_attention = multi_headed_attention_2
    self.feed_forward = feed_forward
    # Three sublayers: one applied after self-attention, one applied
    # after source attention, and the last applied after the
    # position-wise feed-forward.
    self.sublayer = utils.clones(Sublayer(model_size, dropout_rate), 3)
    self.model_size = model_size
def __init__(self, size, self_attn, feed_forward, dropout):
    # size = d_embedding = 512
    # self_attn: a MultiHeadAttention object, the first sublayer
    # feed_forward: a PositionwiseFeedForward object, the second sublayer
    # dropout = 0.1 (e.g.)
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)
    self.size = size

def forward(self, x, mask):
    # x: (batch, num_word, d_embedding)
    # mask: (batch, num_word, num_word), the padding mask in the Encoder
    # In src_vocab, all words except the "<blank>" padding tokens are visible
    # (padding mask); in tgt_vocab, only the words to the left of the current
    # input word are visible.
    # SublayerConnection expects a callable, so self_attn is wrapped in a lambda.
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
    # x: (batch, num_word, d_embedding); self_attn (MultiHeadAttention) keeps
    # the shape (batch, num_word, d_embedding)
    # -> SublayerConnection: (batch, num_word, d_embedding)
    return self.sublayer[1](x, self.feed_forward)