def __init__(self, args, pretrained):
    super(Transfrmr_bidaf, self).__init__()
    self.embed = embed.Embedding(args, pretrained)

    # Encoder module
    self.encoder_ctxt = encode.Encoder_block(args, 2 * args.word_dim)
    self.encoder_ques = encode.Encoder_block(args, 2 * args.word_dim)

    # Attention Flow Layer
    self.att_weight_c = Linear(args.hidden_size * 2, 1, args.dropout)
    self.att_weight_q = Linear(args.hidden_size * 2, 1, args.dropout)
    self.att_weight_cq = Linear(args.hidden_size * 2, 1, args.dropout)

    self.N = args.Model_encoder_size
    self.dropout = nn.Dropout(p=args.dropout)

    # Model Encoding Layer
    self.Model_encoder = self.get_clones(
        encode.Encoder_block(args, 8 * args.word_dim), args.Model_encoder_size)

    # self.Model2start = Linear(16 * args.word_dim, 8 * args.word_dim, args.dropout)
    # self.Model2end = Linear(16 * args.word_dim, 8 * args.word_dim, args.dropout)
    # self.start_idx = Linear(16 * args.word_dim, 1, args.dropout)
    # self.end_idx = Linear(16 * args.word_dim, 1, args.dropout)
    self.start_idx = nn.Linear(16 * args.word_dim, 1)
    self.end_idx = nn.Linear(16 * args.word_dim, 1)
def __init__(self, context_length, embedding_size, dropout=0.0):
    """
    Initialise parameters and layers for Predictor.

    :param context_length: length of the context
    :param embedding_size: hidden embedding size (d2 in the paper)
    :param dropout: dropout rate
    """
    super(Predictor, self).__init__()

    # the following build on this
    d2 = embedding_size
    self.f0 = nn.LSTM(d2, d2)  # input_size, output_size
    self.f1 = nn.LSTM(2 * d2, d2)
    self.f2 = nn.LSTM(3 * d2, d2)
    self.f3 = nn.LSTM(3 * d2, d2)
    self.linear_sup = Linear(
        d2, 2, dropout=dropout
    )  # have 2 output dims because we need to weight the classes
    self.linear_start = Linear(
        d2, 1, dropout=dropout
    )  # with a softmax because there can only be one start or end
    self.linear_end = Linear(d2, 1, dropout=dropout)
    self.linear_type = Linear(
        d2, 3, dropout=dropout)  # 3 because we have 3 answer types - yes, no, and span
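# A minimal, hypothetical forward pass consistent with the layer sizes above
# (d2 -> f0, 2*d2 -> f1, 3*d2 -> f2/f3). The exact wiring of the cascaded LSTMs
# is an assumption, not taken from the original file. Assumes `import torch`.
def forward_sketch(self, x):
    # x: (seq_len, batch, d2) context representation
    h0, _ = self.f0(x)
    sup_logits = self.linear_sup(h0)                     # supporting-fact logits (2 classes)

    h1, _ = self.f1(torch.cat([x, h0], dim=-1))
    start_logits = self.linear_start(h1).squeeze(-1)     # one score per position

    h2, _ = self.f2(torch.cat([x, h0, h1], dim=-1))
    end_logits = self.linear_end(h2).squeeze(-1)

    h3, _ = self.f3(torch.cat([x, h1, h2], dim=-1))
    type_logits = self.linear_type(h3[-1])               # answer type: yes / no / span

    return sup_logits, start_logits, end_logits, type_logits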
def global_attention(query):
    # linear map
    y = Linear(query, global_attention_vec_size, True)
    y = y.view(-1, 1, 1, global_attention_vec_size)
    # Attention mask is a softmax of v_g^{\top} * tanh(...)
    s = torch.sum(global_v * torch.tanh(global_hidden_features + y), dim=[1, 3])
    a = torch.softmax(s, dim=-1)
    return a
def local_attention(query):
    # linear map
    y = Linear(query, local_attention_vec_size, True)
    y = y.view(-1, 1, 1, local_attention_vec_size)
    # Attention mask is a softmax of v_l^{\top} * tanh(...)
    s = torch.sum(local_v * torch.tanh(local_hidden_features + y), dim=[1, 3])
    # Now calculate the attention-weighted vector, i.e., alpha in eq.[2]
    a = torch.softmax(s, dim=-1)
    return a
def attention(query):
    # linear map
    y = Linear(query, attention_vec_size, True)
    y = y.view(-1, 1, 1, attention_vec_size)
    # Attention mask is a softmax of v_d^{\top} * tanh(...).
    s = torch.sum(v * torch.tanh(hidden_features + y), dim=[1, 3])
    # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
    a = torch.softmax(s, dim=-1)
    # eq. [8]
    d = torch.sum(a.view(-1, 1, attn_length, 1) * hidden, dim=[2, 3])
    return d.view(-1, attn_size)
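# Self-contained toy version of the additive-attention pattern used by the three
# closures above, with explicit shapes. All names here are illustrative; the real
# code precomputes hidden_features once per decoding step with a 1x1 convolution
# and uses the repository's own Linear helper for the query projection.
import torch
import torch.nn.functional as F

batch, attn_length, attn_size = 4, 10, 32
hidden = torch.randn(batch, attn_length, attn_size)      # encoder states
W = torch.nn.Linear(attn_size, attn_size, bias=False)    # plays the role of W_d
U = torch.nn.Linear(attn_size, attn_size, bias=True)     # linear map of the query
v = torch.randn(attn_size)                                # plays the role of v_d

query = torch.randn(batch, attn_size)                     # decoder state
s = torch.sum(v * torch.tanh(W(hidden) + U(query).unsqueeze(1)), dim=-1)  # (batch, attn_length)
a = F.softmax(s, dim=-1)                                   # attention weights
d = torch.sum(a.unsqueeze(-1) * hidden, dim=1)             # context vector, (batch, attn_size)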
def global_attention(query): """Put attention masks on global_hidden using global_hidden_features and query.""" # If the query is a tuple (when stacked RNN/LSTM), flatten it if nest.is_sequence(query): query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) with tf.variable_scope("AttnWg"): # linear map y = Linear(query, global_attention_vec_size, True) y = array_ops.reshape( y, [-1, 1, 1, global_attention_vec_size]) # Attention mask is a softmax of v_g^{\top} * tanh(...) s = math_ops.reduce_sum( global_v * math_ops.tanh(global_hidden_features + y), [2, 3]) # Sometimes it's not easy to find a measurement to denote similarity between sensors, # here we omit such prior knowledge in eq.[4]. # You can use "a = nn_ops.softmax((1-lambda)*s + lambda*sim)" to encode similarity info, # where: # sim: a vector with length n_sensors, describing the sim between the target sensor and the others # lambda: a trade-off. a = nn_ops.softmax(s) # a = nn_ops.softmax((1 - lambda) * s + lambda * sim) return a
def step(self, CurrentInput, PrevState):
    """
    Takes an input and the previous 'state' of the NTM and returns the output
    and the next state. The 'states' are a tuple of two lists of weights (in
    order of read heads, then write heads), and the memory (as a matrix). We
    may also want to see how the memory is being accessed at each step, in
    which case we would append the read and write vectors to the output.

    Weights:      list of shape (1, MemoryLength)
    Memory:       shape (MemoryDepth, MemoryLength)
    CurrentInput: shape (1, InputDepth)
    """
    ReadWeights, WriteWeights, Memory = PrevState
    ReadInputs = [ReadMemory(W, Memory) for W in ReadWeights]
    ControlInput = tf.concat(1, ReadInputs + [CurrentInput])
    # now we should put in a control network that takes ControlInput-size inputs
    # and returns a 'control state'
    ControlState = tf.tanh(
        Linear(ControlInput, self.Params.ControlHiddenSize, 'Controller'))
    Output = tf.sigmoid(
        Linear(ControlState, self.Params.InputDepth, 'Output'))

    NextReadWeights = []
    NextWriteWeights = []
    Adds = []
    Erases = []
    for i in xrange(self.Params.nReadHeads):
        with tf.variable_scope('ReadHead%d' % i):
            NextReadWeights.append(
                self.HeadUpdate(ControlState, ReadWeights[i], Memory))
    for i in xrange(self.Params.nWriteHeads):
        with tf.variable_scope('WriteHead%d' % i):
            W, E, A = self.HeadUpdate(ControlState, WriteWeights[i], Memory,
                                      IsWrite=True)
            NextWriteWeights.append(W)
            Erases.append(E)
            Adds.append(A)
    for i in xrange(self.Params.nWriteHeads):
        Memory = WriteMemory(NextWriteWeights[i], Erases[i], Adds[i], Memory)
    return Output, (NextReadWeights, NextWriteWeights, Memory)
def HeadUpdate(self, ControlState, PrevWeights, Memory, IsWrite=False):
    """
    For one head, takes the control state, previous weight and memory, and
    outputs the new weight, and for write-heads, the erase and add vectors as
    well.
    """
    KeyVector = tf.tanh(
        Linear(ControlState, self.Params.MemoryDepth, 'KeyVector'))
    KeyStrength = tf.nn.softplus(Linear(ControlState, 1, 'KeyStrength'))
    Gate = tf.sigmoid(Linear(ControlState, 1, 'Gate'))
    ShiftWeights = tf.nn.softmax(
        Linear(ControlState, len(self.Params.ShiftOffsets), 'ShiftWeights'))
    Sharpen = tf.nn.softplus(Linear(ControlState, 1, 'Sharpen')) + 1.

    # Addressing: content addressing, interpolation with the previous weights,
    # circular-convolution shift, and sharpening (as in the NTM paper).
    Weights = tf.exp(KeyStrength * cosine_similarity(KeyVector, Memory))
    Weights /= tf.reduce_sum(Weights, 1)
    Weights = Gate * Weights + (1.0 - Gate) * PrevWeights
    Weights = circular_convolution(Weights, ShiftWeights, self.Params.ShiftOffsets)
    Weights = tf.pow(Weights, Sharpen)
    Weights /= tf.reduce_sum(Weights, 1)

    if IsWrite:
        Erase = tf.sigmoid(
            Linear(ControlState, self.Params.MemoryDepth, 'Erase'))
        Add = tf.tanh(Linear(ControlState, self.Params.MemoryDepth, 'Add'))
        return Weights, Erase, Add
    else:
        return Weights
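# Self-contained NumPy sketch of the same addressing pipeline (content addressing,
# interpolation, circular shift, sharpening) with toy numbers and a
# (MemoryLength, MemoryDepth) layout; it illustrates the NTM scheme and is not a
# drop-in replacement for the TensorFlow helpers used above.
import numpy as np

def addressing_sketch(key, beta, gate, shift, gamma, prev_w, memory):
    # content addressing: softmax of cosine similarity scaled by key strength beta
    sim = memory @ key / (np.linalg.norm(memory, axis=1) * np.linalg.norm(key) + 1e-8)
    w = np.exp(beta * sim)
    w /= w.sum()
    # interpolation with the previous weights
    w = gate * w + (1.0 - gate) * prev_w
    # circular convolution with the shift distribution over offsets (-1, 0, +1)
    w = np.array([sum(w[(i - j) % len(w)] * shift[j + 1] for j in (-1, 0, 1))
                  for i in range(len(w))])
    # sharpening
    w = w ** gamma
    return w / w.sum()

memory = np.random.randn(8, 4)
prev_w = np.full(8, 1.0 / 8)
w = addressing_sketch(np.random.randn(4), beta=2.0, gate=0.9,
                      shift=np.array([0.1, 0.8, 0.1]), gamma=2.0,
                      prev_w=prev_w, memory=memory)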
def __init__(self, hpm, rand_unif_init, rand_norm_init):
    self.hpm = hpm
    self.rand_unif_init = rand_unif_init
    self.rand_norm_init = rand_norm_init

    with tf.variable_scope('encoder'):
        self.lstm_cell_fw = tf.contrib.rnn.LSTMCell(
            self.hpm["hidden_size"],
            state_is_tuple=True,
            initializer=self.rand_unif_init)  # forward lstm cell
        self.lstm_cell_bw = tf.contrib.rnn.LSTMCell(
            self.hpm["hidden_size"],
            state_is_tuple=True,
            initializer=self.rand_unif_init)  # backward lstm cell

        self.w_c = Linear(
            self.hpm['hidden_size'], True, "reduce_c", self.rand_norm_init
        )  # parameters for the linear transformation of the concatenated cell state
        self.w_h = Linear(
            self.hpm['hidden_size'], True, 'reduce_h', self.rand_norm_init
        )  # parameters for the linear transformation of the concatenated hidden output
def __init__(self, n_enc_1, n_enc_2, n_enc_3, n_dec_1, n_dec_2, n_dec_3,
             n_input, n_z):
    super(AE, self).__init__()
    # encoder
    self.enc_1 = Linear(n_input, n_enc_1)
    self.enc_2 = Linear(n_enc_1, n_enc_2)
    self.enc_3 = Linear(n_enc_2, n_enc_3)
    self.z_layer = Linear(n_enc_3, n_z)
    # decoder
    self.dec_1 = Linear(n_z, n_dec_1)
    self.dec_2 = Linear(n_dec_1, n_dec_2)
    self.dec_3 = Linear(n_dec_2, n_dec_3)
    self.x_bar_layer = Linear(n_dec_3, n_input)
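# A minimal forward sketch, assuming the Linear wrapper above behaves like
# nn.Linear; the ReLU non-linearities are an assumption, not taken from the
# original file. Assumes `import torch.nn.functional as F`.
def forward_sketch(self, x):
    enc_h1 = F.relu(self.enc_1(x))
    enc_h2 = F.relu(self.enc_2(enc_h1))
    enc_h3 = F.relu(self.enc_3(enc_h2))
    z = self.z_layer(enc_h3)            # latent code

    dec_h1 = F.relu(self.dec_1(z))
    dec_h2 = F.relu(self.dec_2(dec_h1))
    dec_h3 = F.relu(self.dec_3(dec_h2))
    x_bar = self.x_bar_layer(dec_h3)    # reconstruction

    return x_bar, z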
def local_attention(query): """Put attention masks on local_hidden using local_hidden_features and query.""" # If the query is a tuple (when stacked RNN/LSTM), flatten it if nest.is_sequence(query): query_list = nest.flatten(query) for q in query_list: ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) with tf.variable_scope("AttnWl"): # linear map y = Linear(query, local_attention_vec_size, True) y = array_ops.reshape( y, [-1, 1, 1, local_attention_vec_size]) # Attention mask is a softmax of v_l^{\top} * tanh(...) s = math_ops.reduce_sum( local_v * math_ops.tanh(local_hidden_features + y), [2, 3]) # Now calculate the attention-weighted vector, i.e., alpha in eq.[2] a = nn_ops.softmax(s) return a
def __init__(self, hpm, rand_unif_init, rand_norm_init):
    self.rand_unif_init = rand_unif_init
    self.rand_norm_init = rand_norm_init
    self.hpm = hpm

    with tf.variable_scope('attention_decoder', reuse=tf.AUTO_REUSE):
        self.decoder = Decoder(
            self.hpm, self.rand_unif_init
        )  # simple decoder object (unidirectional lstm)

        # Almost all the parameters (weights and biases) for the linear
        # transformations (see below in the call method)
        self.w_h = Linear(self.hpm['attn_hidden_size'], True, "h")
        self.w_s = Linear(self.hpm['attn_hidden_size'], True, "s")
        self.v = Linear(1, False, 'V')
        self.w_dec = Linear(self.hpm['emb_size'], True, "dec_inp")
        self.w_out = Linear(self.hpm['vocab_size'], True, 'out')

        if self.hpm['pointer_gen']:
            self.w_c_reduce = Linear(1, True, 'c_reduce')
            self.w_s_reduce = Linear(1, True, 's_reduce')
            self.w_i_reduce = Linear(1, True, 'i_reduce')
def attention(query): """Put attention masks on local_hidden using local_hidden_features and query.""" # If the query is a tuple (when stacked RNN/LSTM), flatten it if nest.is_sequence(query): query_list = nest.flatten(query) for q in query_list: # Check that ndims == 2 if specified. ndims = q.get_shape().ndims if ndims: assert ndims == 2 query = array_ops.concat(query_list, 1) with vs.variable_scope("Attn_Wpd"): # linear map y = Linear(query, attention_vec_size, True) y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size]) # Attention mask is a softmax of v_d^{\top} * tanh(...). s = math_ops.reduce_sum( v * math_ops.tanh(hidden_features + y), [2, 3]) # Now calculate the attention-weighted vector, i.e., gamma in eq.[7] a = nn_ops.softmax(s) # eq. [8] d = math_ops.reduce_sum( array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2]) return array_ops.reshape(d, [-1, attn_size])
def __init__(self, args, pretrained):
    super(BiDAF, self).__init__()
    self.args = args

    # 1. Character Embedding Layer
    self.char_embedding = nn.Embedding(args.char_vocab_size, args.char_dim,
                                       padding_idx=1)
    nn.init.uniform_(self.char_embedding.weight, -0.001, 0.001)

    self.char_convolution = nn.Sequential(
        nn.Conv2d(1, args.char_channel_size,
                  (args.char_dim, args.char_channel_width)),
        nn.ReLU())

    # 2. Word Embedding Layer
    # initialize word embedding with GloVe
    self.word_embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)

    # highway network
    assert self.args.hidden_size * 2 == (self.args.char_channel_size +
                                         self.args.word_dim)
    for i in range(2):
        setattr(self, 'highway_linear{}'.format(i),
                nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2),
                              nn.ReLU()))
        setattr(self, 'highway_gate{}'.format(i),
                nn.Sequential(Linear(args.hidden_size * 2, args.hidden_size * 2),
                              nn.Sigmoid()))

    # 3. Contextual Embedding Layer
    self.context_LSTM = LSTM(input_size=args.hidden_size * 2,
                             hidden_size=args.hidden_size,
                             bidirectional=True,
                             batch_first=True,
                             dropout=args.dropout)

    # 4. Attention Flow Layer
    self.att_weight_c = Linear(args.hidden_size * 2, 1)
    self.att_weight_q = Linear(args.hidden_size * 2, 1)
    self.att_weight_cq = Linear(args.hidden_size * 2, 1)

    # 5. Modeling Layer
    self.modeling_LSTM1 = LSTM(input_size=args.hidden_size * 8,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)
    self.modeling_LSTM2 = LSTM(input_size=args.hidden_size * 2,
                               hidden_size=args.hidden_size,
                               bidirectional=True,
                               batch_first=True,
                               dropout=args.dropout)

    # 6. Output Layer
    self.p1_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p1_weight_m = Linear(args.hidden_size * 2, 1, dropout=args.dropout)
    self.p2_weight_g = Linear(args.hidden_size * 8, 1, dropout=args.dropout)
    self.p2_weight_m = Linear(args.hidden_size * 2, 1, dropout=args.dropout)

    self.output_LSTM = LSTM(input_size=args.hidden_size * 2,
                            hidden_size=args.hidden_size,
                            bidirectional=True,
                            batch_first=True,
                            dropout=args.dropout)

    self.dropout = nn.Dropout(p=args.dropout)
def temporal_attention(self, decoder_inputs, external_inputs, initial_state,
                       attention_states, cell, output_size=None,
                       loop_function=None, dtype=tf.float32, scope=None,
                       initial_state_attention=False, external_flag=True):
    """
    Temporal attention in GeoMAN.

    Args:
        decoder_inputs: A list (length: n_steps_decoder) of 2D Tensors [batch_size, n_input_decoder].
        external_inputs: A list (length: n_steps_decoder) of 2D Tensors [batch_size, n_external_input].
        initial_state: 2D Tensor [batch_size, cell.state_size].
        attention_states: 3D Tensor [batch_size, n_step_encoder, n_hidden_encoder].
        cell: core_rnn_cell.RNNCell defining the cell function and size.
        output_size: Size of the output vectors; if None, we use cell.output_size.
        loop_function: the loop function we use.
        dtype: The dtype to use for the RNN initial state (default: tf.float32).
        scope: VariableScope for the created subgraph; default: "temporal_attn".
        initial_state_attention: If False (default), initial attentions are zero.
        external_flag: whether to use external factors.

    Returns:
        A tuple of the form (outputs, state), where:
            outputs: A list of the same length as decoder_inputs, of 2D Tensors
                of shape [batch_size x output_size].
            state: The state of each decoder cell at the final time step.
    """
    # check inputs
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not external_inputs:
        raise ValueError("Must provide at least 1 ext_input to attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s" %
                         attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    # implementation of temporal attention
    with vs.variable_scope(scope or "temporal_attn", dtype=dtype) as scope:
        dtype = scope.dtype

        # Needed for reshaping.
        batch_size = array_ops.shape(decoder_inputs[0])[0]
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # A trick: calculate W_d * h_o by a 1-by-1 convolution
        # See eq.[6] in the paper
        hidden = array_ops.reshape(
            attention_states, [-1, attn_length, 1, attn_size])  # need to reshape before
        # Size of query vectors for attention.
        attention_vec_size = attn_size
        w = vs.get_variable("Attn_Wd",
                            [1, 1, attn_size, attention_vec_size])  # W_d
        hidden_features = nn_ops.conv2d(hidden, w, [1, 1, 1, 1], "SAME")  # W_d * h_o
        v = vs.get_variable("Attn_v", [attention_vec_size])  # v_d
        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            # If the query is a tuple (when stacked RNN/LSTM), flatten it
            if nest.is_sequence(query):
                query_list = nest.flatten(query)
                for q in query_list:
                    # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            with vs.variable_scope("Attn_Wpd"):
                # linear map
                y = Linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v_d^{\top} * tanh(...).
                s = math_ops.reduce_sum(
                    v * math_ops.tanh(hidden_features + y), [2, 3])
                # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
                a = nn_ops.softmax(s)
                # eq. [8]
                d = math_ops.reduce_sum(
                    array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                return array_ops.reshape(d, [-1, attn_size])

        if initial_state_attention:
            attn = attention(initial_state)
        else:
            batch_attn_size = array_ops.stack([batch_size, attn_size])
            attn = array_ops.zeros(batch_attn_size, dtype=dtype)
            attn.set_shape([None, attn_size])

        i = 0
        outputs = []
        prev = None
        for inp, ext_inp in zip(decoder_inputs, external_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with vs.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            # we map the concatenation to shape [batch_size, input_size]
            if external_flag:
                x = Linear([inp] + [ext_inp] + [attn], input_size, True)
            else:
                x = Linear([inp] + [attn], input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with vs.variable_scope(vs.get_variable_scope(), reuse=True):
                    attn = attention(state)
            else:
                attn = attention(state)
            # Attention output projection
            with vs.variable_scope("AttnOutputProjection"):
                output = Linear([cell_output] + [attn], output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)
            i += 1

        return outputs, state
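# Hypothetical call-site sketch for the decoder above (TensorFlow 1.x); `model`
# and all sizes are illustrative, not taken from the original file.
import tensorflow as tf

n_steps_decoder, n_steps_encoder = 6, 12
n_input_decoder, n_external_input, n_hidden = 1, 5, 64

decoder_inputs = [tf.placeholder(tf.float32, [None, n_input_decoder])
                  for _ in range(n_steps_decoder)]
external_inputs = [tf.placeholder(tf.float32, [None, n_external_input])
                   for _ in range(n_steps_decoder)]
attention_states = tf.placeholder(tf.float32, [None, n_steps_encoder, n_hidden])

cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
initial_state = cell.zero_state(tf.shape(decoder_inputs[0])[0], tf.float32)

outputs, state = model.temporal_attention(decoder_inputs, external_inputs,
                                          initial_state, attention_states, cell)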
def __init__(self, output_num, normalize, linear_size, input_size):
    super(Projection, self).__init__()
    self.normalize = normalize
    self.l1 = Linear(input_size, linear_size, bn=False, activ='relu')
    self.l2 = Linear(linear_size, output_num, bn=False, activ=None)
def __init__(self, num_classes, input_size):
    super(Output, self).__init__()
    self.l1 = Linear(input_size, num_classes, bn=False, activ=None)
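# Hypothetical usage sketch: Projection as a two-layer head on top of backbone
# features and Output as a plain linear classifier; all sizes are illustrative,
# and the bn / activ keywords are assumed to be handled by the repository's
# Linear wrapper as its signature suggests.
projection_head = Projection(output_num=128, normalize=True,
                             linear_size=512, input_size=2048)
classifier = Output(num_classes=10, input_size=2048)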
def temporal_attention(self, decoder_inputs, external_inputs, encoder_state,
                       attention_states, cell, external_flag, output_size=64):
    # Needed for reshaping.
    batch_size = decoder_inputs[0].data.size(0)
    attn_length = attention_states.data.size(1)
    attn_size = attention_states.data.size(2)

    # A trick: calculate W_d * h_o by a 1-by-1 convolution
    # See eq.[6] in the paper
    hidden = attention_states.view(-1, attn_size, attn_length, 1)  # need to reshape before
    # Size of query vectors for attention.
    attention_vec_size = attn_size
    w_conv = nn.Conv2d(attn_size, attention_vec_size, (1, 1), (1, 1))
    hidden_features = w_conv(hidden)
    # v = Variable(torch.zeros(attention_vec_size))  # v_d
    v = nn.Parameter(torch.FloatTensor(attention_vec_size))
    init.normal_(v)

    def attention(query):
        # linear map
        y = Linear(query, attention_vec_size, True)
        y = y.view(-1, 1, 1, attention_vec_size)
        # Attention mask is a softmax of v_d^{\top} * tanh(...).
        s = torch.sum(v * torch.tanh(hidden_features + y), dim=[1, 3])
        # Now calculate the attention-weighted vector, i.e., gamma in eq.[7]
        a = torch.softmax(s, dim=-1)
        # eq. [8]
        d = torch.sum(a.view(-1, 1, attn_length, 1) * hidden, dim=[2, 3])
        return d.view(-1, attn_size)

    # attn = Variable(torch.zeros(batch_size, attn_size))
    attn = nn.Parameter(torch.FloatTensor(batch_size, attn_size))
    init.xavier_uniform_(attn)

    i = 0
    outputs = []
    prev = None
    for inp, ext_inp in zip(decoder_inputs, external_inputs):
        # Merge input and previous attentions into one vector of the right size.
        input_size = inp.data.size(1)  # input_size is the dimensionality of the input vector
        # we map the concatenation to shape [batch_size, input_size]
        if external_flag:
            x = Linear([inp.float()] + [ext_inp.float()] + [attn.float()],
                       input_size, True)
        else:
            x = Linear([inp.float()] + [attn.float()], input_size, True)
        # Run the RNN.
        cell_output, state = cell(x)
        # Run the attention mechanism.
        attn = attention([state])
        # Attention output projection
        output = Linear([cell_output] + [attn], output_size, True)
        outputs.append(output)
        i += 1

    return outputs, state
def __init__(self, hidden_size):
    super(BiATT, self).__init__()
    self.att_weight_c = Linear(hidden_size * 2, 1)
    self.att_weight_q = Linear(hidden_size * 2, 1)
    self.att_weight_cq = Linear(hidden_size * 2, 1)
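# A minimal sketch of how these three weights are typically combined into the
# BiDAF similarity matrix S and the context-to-query / query-to-context
# attentions, assuming Linear behaves like nn.Linear; this forward is
# illustrative, not copied from the original module. Assumes `import torch`
# and `import torch.nn.functional as F`.
def forward_sketch(self, c, q):
    # c: (batch, c_len, hidden_size * 2), q: (batch, q_len, hidden_size * 2)
    c_len, q_len = c.size(1), q.size(1)

    # S[b, i, j] = w_c . c_i + w_q . q_j + w_cq . (c_i * q_j)
    s_c = self.att_weight_c(c)                                    # (batch, c_len, 1)
    s_q = self.att_weight_q(q).transpose(1, 2)                    # (batch, 1, q_len)
    s_cq = torch.stack(
        [self.att_weight_cq(c * q[:, j].unsqueeze(1)).squeeze(-1)
         for j in range(q_len)], dim=-1)                          # (batch, c_len, q_len)
    s = s_c + s_q + s_cq

    # context-to-query attention
    a = F.softmax(s, dim=2)
    c2q = torch.bmm(a, q)                                         # (batch, c_len, hidden*2)

    # query-to-context attention
    b = F.softmax(torch.max(s, dim=2)[0], dim=1)                  # (batch, c_len)
    q2c = torch.bmm(b.unsqueeze(1), c).expand(-1, c_len, -1)      # (batch, c_len, hidden*2)

    return torch.cat([c, c2q, c * c2q, c * q2c], dim=-1)          # (batch, c_len, hidden*8)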