def call(self, x, y, mask, training=False):
    """Attend from every position of ``x`` over the positions of ``y``.

    Both inputs go through dropout and, unless ``self.identity`` is set,
    through one shared ReLU projection before dot-product scoring.

    Args:
        x: Attending sequence; used as a rank-3 tensor by the matmul and
            transpose below (presumably ``[batch, len_x, dim]`` -- TODO confirm).
        y: Attended sequence, multiplied against ``x`` along the last axis.
        mask: Optional mask over the positions of ``y``; ``None`` skips masking.
        training: Forwarded to ``dropout`` and to ``self.combine``.

    Returns:
        The attention-weighted sum of the (dropped, projected) ``y``; when
        ``self.combine`` is set, the result of combining it with the original
        ``x`` instead.
    """
    self.step += 1
    original_x = x
    x = dropout(x, keep_prob=self.keep_prob, training=training)
    y = dropout(y, keep_prob=self.keep_prob, training=training)

    # The projection is created on the call that brings the counter to 0
    # (assumes ``self.step`` starts at -1 -- TODO confirm in ``__init__``).
    if self.step == 0:
        if self.identity:
            self.linear = None
        else:
            self.linear = layers.Dense(melt.get_shape(x, -1),
                                       activation=tf.nn.relu)

    # NOTICE shared linear: the same projection is applied to both inputs.
    if self.linear is not None:
        x = self.linear(x)
        y = self.linear(y)

    scores = tf.matmul(x, tf.transpose(y, [0, 2, 1]))
    if mask is not None:
        len_x = melt.get_shape(x, 1)
        tiled_mask = tf.tile(tf.expand_dims(mask, axis=1), [1, len_x, 1])
        scores = softmax_mask(scores, tiled_mask)

    alpha = tf.nn.softmax(scores)
    self.alpha = alpha  # kept on the instance after the call
    attended = tf.matmul(alpha, y)

    if self.combine is None:
        return attended
    return self.combine(original_x, attended, training=training)
def call(self, x, training=False):
    """Classify the ``'comment'`` sequence of a feature mapping.

    Pipeline: embedding -> ``self.encode`` -> length-aware pooling -> logits.

    Args:
        x: Mapping that holds the token ids under the key ``'comment'``.
        training: Kept for interface compatibility; none of the sub-layer
            calls below receive it.

    Returns:
        The output of ``self.logits`` for the pooled encoding.
    """
    tokens = x['comment']
    length = melt.length(tokens)
    #with tf.device('/cpu:0'):
    embedded = self.embedding(tokens)
    # The encoder is invoked without lengths or dropout masks, so the
    # per-layer forward/backward dropout masks that used to be built here were
    # never consumed; that dead construction has been removed.
    #x = self.encode(x, length, mask_fws=mask_fws, mask_bws=mask_bws)
    encoded = self.encode(embedded)
    pooled = self.pooling(encoded, length)
    return self.logits(pooled)
def call(self, input, training=False):
    """Score a (query, passage) pair.

    Both sequences are embedded with the shared embedding, encoded jointly by
    ``self.encode`` using one set of per-layer dropout masks, pooled, and
    mapped to logits.

    Args:
        input: Mapping with ``'query'`` and ``'passage'`` token ids, plus
            ``'type'`` when ``FLAGS.use_type`` or ``FLAGS.split_type`` is set.
            ``'type'`` is expanded on axis 1 and concatenated to the pooled
            features, so it is expected to carry one value per example.
        training: Forwarded to the dropout-mask construction.

    Returns:
        Logits from ``self.logits``; with ``FLAGS.split_type`` the average of
        both heads for examples whose type is 0 and ``self.logits2`` otherwise.
    """
    query = input['query']
    passage = input['passage']
    query_len = melt.length(query)
    passage_len = melt.length(passage)
    #with tf.device('/cpu:0'):
    query_emb = self.embedding(query)
    passage_emb = self.embedding(passage)

    batch_size = melt.get_shape(query_emb, 0)
    layer_dims = [
        melt.get_shape(query_emb, -1) if layer == 0 else 2 * self.num_units
        for layer in range(self.num_layers)
    ]

    def dropout_mask(dim):
        # One mask row per example, broadcast over the time axis.
        return melt.dropout(tf.ones([batch_size, 1, dim], dtype=tf.float32),
                            keep_prob=self.keep_prob,
                            training=training,
                            mode=None)

    mask_fws = [dropout_mask(dim) for dim in layer_dims]
    mask_bws = [dropout_mask(dim) for dim in layer_dims]

    x = self.encode(query_emb, query_len, passage_emb, passage_len,
                    mask_fws=mask_fws, mask_bws=mask_bws)
    x = self.pooling(x, query_len, passage_len)

    if FLAGS.use_type:
        type_feature = tf.expand_dims(tf.to_float(input['type']), 1)
        x = tf.concat([x, type_feature], 1)

    if not FLAGS.split_type:
        return self.logits(x)

    logits1 = self.logits(x)
    logits2 = self.logits2(x)
    # tf.cond takes a single scalar predicate, so it cannot pick a branch per
    # example (and `tensor == 0` is not an element-wise op in TF 1.x). Select
    # per row with a 0/1 mask built from tf.equal instead.
    is_type0 = tf.expand_dims(tf.to_float(tf.equal(input['type'], 0)), 1)
    return is_type0 * ((logits1 + logits2) / 2.) + (1. - is_type0) * logits2
def encode(self, seq, seq_len=None, output_method='all'):
    """Encode ``seq`` with a stack of same-padded 1-D convolutions.

    Layer ``k`` uses kernel size ``kernel_sizes[k]`` and ``self.num_units``
    filters with ReLU; the outputs of all layers are concatenated on axis 2.

    Args:
        seq: Input sequence tensor fed to ``tf.layers.conv1d``.
        seq_len: Optional sequence lengths, forwarded to
            ``melt.rnn.encode_outputs``.
        output_method: Forwarded to ``melt.rnn.encode_outputs``.

    Returns:
        Whatever ``melt.rnn.encode_outputs`` returns for the concatenated
        layer outputs.

    Raises:
        AssertionError: If ``self.num_layers`` exceeds the number of
            configured kernel sizes.
    """
    with tf.variable_scope(self.scope):
        num_filters = self.num_units
        kernel_sizes = [3, 5, 7, 9, 11, 13]
        #kernel_sizes = [3] * 7
        assert self.num_layers <= len(kernel_sizes)

        layer_outputs = []
        for layer in range(self.num_layers):
            # Fix: the dropped-out tensor is now what the convolution reads.
            # Previously the dropout result was immediately overwritten by a
            # convolution over the un-dropped previous output, so dropout
            # never had any effect.
            seq = melt.dropout(seq, self.keep_prob, self.is_train)
            seq = tf.layers.conv1d(seq,
                                   num_filters,
                                   kernel_size=kernel_sizes[layer],
                                   padding='same',
                                   activation=tf.nn.relu)
            layer_outputs.append(seq)

        outputs = tf.concat(layer_outputs, 2)
        # No dropout on the concatenated output; that is left to the caller.
        # encode_outputs keeps the return format compatible with the rnn
        # encoders.
        return melt.rnn.encode_outputs(outputs, seq_len, output_method)
def call(self, inputs, memory, inputs_mask, memory_mask, training=False):
    """Bidirectional (context-to-query and query-to-context) dot attention.

    Args:
        inputs: Context sequence; used as a rank-3 tensor by the matmuls below
            (presumably ``[batch, clen, dim]`` -- TODO confirm).
        memory: Sequence attended over by every context position.
        inputs_mask: Optional mask over context positions, or ``None``.
        memory_mask: Optional mask over memory positions, or ``None``.
        training: Forwarded to ``dropout`` and to ``self.combine``.

    Returns:
        ``concat([c2q, q2c], -1)``; when ``self.combine`` is set, the result
        of combining that tensor with ``inputs`` instead.
    """
    combiner = self.combiner  # not used below; attribute read kept as-is
    # DotAttention already convert to dot_attention
    # Scores come from the dropped-out projections, while the weighted sums
    # further down use the original ``memory`` / ``inputs``.
    dropped_inputs = dropout(inputs, keep_prob=self.keep_prob, training=training)
    dropped_memory = dropout(memory, keep_prob=self.keep_prob, training=training)
    context_len = tf.shape(inputs)[1]

    # NOTE(review): the source layout lost its indentation; the combine step
    # is assumed to sit outside this variable scope -- confirm.
    with tf.variable_scope("attention"):
        projected_inputs = self.inputs_dense(dropped_inputs)
        projected_memory = self.memory_dense(dropped_memory)

        # One scaled similarity matrix shared by c2q and q2c attention.
        similarity = tf.matmul(projected_inputs,
                               tf.transpose(projected_memory, [0, 2, 1]))
        scores = similarity / (self.hidden**0.5)

        # c2q attention
        if memory_mask is not None:
            tiled_mask = tf.tile(tf.expand_dims(memory_mask, axis=1),
                                 [1, context_len, 1])
            scores = softmax_mask(scores, tiled_mask)
        alpha = tf.nn.softmax(scores)
        self.alpha = alpha
        c2q = tf.matmul(alpha, memory)

        # TODO check this with allennlp implementation since not good result here...
        # q2c attention: one weight per context position, taken from the best
        # (masked) score of that position.
        logits = tf.reduce_max(scores, -1)
        if inputs_mask is not None:
            logits = softmax_mask(logits, inputs_mask)
        alpha2 = tf.nn.softmax(logits)
        # A single summary vector of ``inputs`` per example, repeated for
        # every context position.
        summary = tf.matmul(tf.expand_dims(alpha2, 1), inputs)
        q2c = tf.tile(summary, [1, context_len, 1])

        outputs = tf.concat([c2q, q2c], -1)

    if self.combine is None:
        return outputs
    return self.combine(inputs, outputs, training=training)
def call(self, inputs, memory, mask, self_match=False, training=False):
    """Scaled dot-product attention of ``inputs`` over ``memory``.

    Args:
        inputs: Attending sequence (rank 3 as used by the matmul below).
        memory: Attended sequence; the weighted sum is taken over the
            original (un-dropped) ``memory``.
        mask: Optional mask over memory positions, or ``None``.
        self_match: When true, the dropped-out/projected ``inputs`` are reused
            on the memory side instead of projecting ``memory`` separately.
        training: Forwarded to ``dropout`` and to ``self.combine``.

    Returns:
        The attended memory; when ``self.combine`` is set, the result of
        combining it with ``inputs`` instead.
    """
    combiner = self.combiner  # not used below; attribute read kept as-is
    # DotAttention already convert to dot_attention
    # TODO... here has some problem might for self match dot attention as same
    # inputs with different dropout...Try self_match == True and verify..
    # NOTICE self_match == False following HKUST rnet
    dropped_inputs = dropout(inputs, keep_prob=self.keep_prob, training=training)
    if self_match:
        dropped_memory = dropped_inputs
    else:
        dropped_memory = dropout(memory, keep_prob=self.keep_prob,
                                 training=training)
    inputs_len = tf.shape(inputs)[1]

    # TODO remove scope ?
    # NOTE(review): the source layout lost its indentation; the combine step
    # is assumed to sit outside this variable scope -- confirm.
    with tf.variable_scope("attention"):
        projected_inputs = self.inputs_dense(dropped_inputs)
        if self_match:
            projected_memory = projected_inputs
        else:
            projected_memory = self.memory_dense(dropped_memory)

        similarity = tf.matmul(projected_inputs,
                               tf.transpose(projected_memory, [0, 2, 1]))
        scores = similarity / (self.hidden**0.5)

        if mask is not None:
            # Repeat the memory mask for every attending position.
            tiled_mask = tf.tile(tf.expand_dims(mask, axis=1),
                                 [1, inputs_len, 1])
            scores = softmax_mask(scores, tiled_mask)

        alpha = tf.nn.softmax(scores)
        self.alpha = alpha
        outputs = tf.matmul(alpha, memory)

    if self.combine is None:
        return outputs
    return self.combine(inputs, outputs, training=training)
def call(self, x, y, training=False):
    """Concatenate ``x`` and ``y`` on axis 2 and scale by a learned gate.

    Args:
        x: First tensor; must be concatenable with ``y`` on axis 2.
        y: Second tensor.
        training: Forwarded to ``dropout``.

    Returns:
        ``concat([x, y], 2) * sigmoid_gate``, where the gate is a bias-free
        dense layer applied to the dropped-out concatenation.
    """
    self.step += 1
    merged = tf.concat([x, y], axis=2)
    width = melt.get_shape(merged, -1)
    dropped = dropout(merged, keep_prob=self.keep_prob, training=training)

    # The gate layer is created on the call that brings the counter to 0
    # (assumes ``self.step`` starts at -1 -- TODO confirm in ``__init__``).
    if self.step == 0:
        self.dense = layers.Dense(width,
                                  use_bias=False,
                                  activation=tf.nn.sigmoid)

    gate = self.dense(dropped)
    # The gate multiplies the un-dropped concatenation.
    return merged * gate
def call(self, input, training=False):
    """Read a passage against a query and produce logits.

    Steps: embed both sequences, encode them with shared per-layer dropout
    masks, attend from the passage over the query, re-encode the attended
    passage, pool it, and map it to logits.

    Args:
        input: Mapping with ``'query'`` and ``'passage'`` token ids, plus
            ``'type'`` when ``FLAGS.use_type`` or ``FLAGS.split_type`` is set.
            ``'type'`` is expanded on axis 1 and concatenated to the pooled
            features, so it is expected to carry one value per example.
        training: Forwarded to the dropout masks and the attention layer.

    Returns:
        Logits from ``self.logits``; with ``FLAGS.split_type`` the average of
        both heads for examples whose type is 0 and ``self.logits2`` otherwise.
    """
    query = input['query']
    passage = input['passage']
    q_len = melt.length(query)
    c_len = melt.length(passage)
    q_mask = tf.cast(query, tf.bool)
    q_emb = self.embedding(query)
    c_emb = self.embedding(passage)

    batch_size = melt.get_shape(c_emb, 0)

    def dropout_mask(dim):
        # One mask row per example, broadcast over the time axis.
        return melt.dropout(tf.ones([batch_size, 1, dim], dtype=tf.float32),
                            keep_prob=self.keep_prob,
                            training=training,
                            mode=None)

    layer_dims = [
        melt.get_shape(c_emb, -1) if layer == 0 else 2 * self.num_units
        for layer in range(self.num_layers)
    ]
    mask_fws = [dropout_mask(dim) for dim in layer_dims]
    mask_bws = [dropout_mask(dim) for dim in layer_dims]

    # Passage and query are encoded with the same dropout masks.
    c = self.encode(c_emb, c_len, mask_fws=mask_fws, mask_bws=mask_bws)
    q = self.encode(q_emb, q_len, mask_fws=mask_fws, mask_bws=mask_bws)

    qc_att = self.att_dot_attention(c, q, mask=q_mask, training=training)

    # The attention encoder receives masks for a single layer only.
    att_dim = melt.get_shape(qc_att, -1)
    att_mask_fws = [dropout_mask(att_dim)]
    att_mask_bws = [dropout_mask(att_dim)]
    x = self.att_encode(qc_att, c_len,
                        mask_fws=att_mask_fws, mask_bws=att_mask_bws)
    x = self.pooling(x, c_len)

    if FLAGS.use_type:
        type_feature = tf.expand_dims(tf.to_float(input['type']), 1)
        x = tf.concat([x, type_feature], 1)

    if not FLAGS.split_type:
        return self.logits(x)

    logits1 = self.logits(x)
    logits2 = self.logits2(x)
    # tf.cond takes a single scalar predicate, so it cannot pick a branch per
    # example (and `tensor == 0` is not an element-wise op in TF 1.x). Select
    # per row with a 0/1 mask built from tf.equal instead.
    is_type0 = tf.expand_dims(tf.to_float(tf.equal(input['type'], 0)), 1)
    return is_type0 * ((logits1 + logits2) / 2.) + (1. - is_type0) * logits2
def encode(self, seq, seq_len=None, output_method='all'):
    """Encode ``seq`` with stacked kernel-size-3 convolutions.

    When ``self.use_position_encoding`` is set, the input is first scaled by
    the square root of its last dimension, zeroed at padded positions (if
    ``seq_len`` is given) and offset by a position encoding. The outputs of
    all convolution layers are concatenated on axis 2.

    Args:
        seq: Input sequence tensor fed to ``tf.layers.conv1d``.
        seq_len: Optional sequence lengths; used for the padding mask and
            forwarded to ``melt.rnn.encode_outputs``.
        output_method: Forwarded to ``melt.rnn.encode_outputs``.

    Returns:
        Whatever ``melt.rnn.encode_outputs`` returns for the concatenated
        layer outputs.

    Raises:
        AssertionError: If ``self.num_layers`` exceeds the number of
            configured kernel sizes.
    """
    with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
        if self.use_position_encoding:
            hidden_size = melt.get_shape(seq, -1)
            # Scale embedding by the sqrt of the hidden size
            seq *= hidden_size ** 0.5
            if seq_len is not None:
                # sequence_mask is 1 at real positions and 0 at padding, so
                # the product zeroes the padded embeddings. maxlen pins the
                # mask to the time axis of ``seq`` in case the batch is padded
                # beyond its longest sequence.
                valid = tf.to_float(
                    tf.sequence_mask(seq_len, maxlen=tf.shape(seq)[1]))
                seq *= tf.expand_dims(valid, -1)
            pos_encoding = model_utils.get_position_encoding(
                tf.shape(seq)[1], tf.shape(seq)[-1])
            seq = seq + pos_encoding

        num_filters = self.num_filters
        #kernel_sizes = [3, 5, 7, 9, 11, 13]
        kernel_sizes = [3] * 7
        assert self.num_layers <= len(kernel_sizes)

        layer_outputs = []
        for layer in range(self.num_layers):
            # Fix: the dropped-out tensor is now what the convolution reads.
            # Previously the dropout result was immediately overwritten by a
            # convolution over the un-dropped previous output, so dropout
            # never had any effect.
            seq = melt.dropout(seq, self.keep_prob, self.is_train)
            seq = tf.layers.conv1d(seq,
                                   num_filters,
                                   kernel_size=kernel_sizes[layer],
                                   padding='same',
                                   activation=tf.nn.relu)
            layer_outputs.append(seq)

        outputs = tf.concat(layer_outputs, 2)
        # No dropout on the concatenated output; that is left to the caller.
        # encode_outputs keeps the return format compatible with the rnn
        # encoders.
        return melt.rnn.encode_outputs(outputs, seq_len, output_method)
def call(self, x, fusions, training=False):
    """Fuse ``x`` with additional vectors through a gated composition.

    Args:
        x: Base tensor; its last dimension sets the output width.
        fusions: Non-empty list of tensors concatenated with ``x`` on the last
            axis.
        training: Forwarded to ``dropout``.

    Returns:
        ``g * r + (1 - g) * x`` where ``r`` (tanh) and ``g`` (sigmoid) are
        dense projections of the dropped-out concatenation.

    Raises:
        AssertionError: If ``fusions`` is empty.
    """
    self.step += 1
    assert len(fusions) > 0

    # size = [batch_size, ..., input_dim * (len(fusion_vectors) + 1)]
    stacked = tf.concat([x] + fusions, axis=-1)
    out_dim = melt.get_shape(x, -1)
    dropped = dropout(stacked, keep_prob=self.keep_prob, training=training)

    # Both projections are created on the call that brings the counter to 0
    # (assumes ``self.step`` starts at -1 -- TODO confirm in ``__init__``).
    if self.step == 0:
        self.composition_dense = layers.Dense(out_dim,
                                              use_bias=True,
                                              activation=tf.nn.tanh,
                                              name='compostion_dense')
        self.gate_dense = layers.Dense(out_dim,
                                       use_bias=True,
                                       activation=tf.nn.sigmoid,
                                       name='gate_dense')

    candidate = self.composition_dense(dropped)
    gate = self.gate_dense(dropped)
    return gate * candidate + (1 - gate) * x
def call(self, x, mask, training=False):
    """Self-attention: every position of ``x`` attends over ``x`` itself.

    Args:
        x: Input sequence; used as a rank-3 tensor by the matmul and
            transpose below (presumably ``[batch, len, dim]`` -- TODO confirm).
        mask: Optional mask over positions, or ``None``.
        training: Forwarded to ``dropout`` and to ``self.combine``.

    Returns:
        The attention-weighted sum of the (dropped, projected) ``x``; when
        ``self.combine`` is set, the result of combining it with the original
        ``x`` instead.
    """
    self.step += 1
    original_x = x
    x = dropout(x, keep_prob=self.keep_prob, training=training)

    # The projection is created on the call that brings the counter to 0
    # (assumes ``self.step`` starts at -1 -- TODO confirm in ``__init__``).
    if self.step == 0:
        if not self.identity:
            self.linear = layers.Dense(melt.get_shape(x, -1),
                                       activation=tf.nn.relu)
        else:
            self.linear = None

    # NOTICE shared linear!
    if self.linear is not None:
        x = self.linear(x)

    scores = tf.matmul(x, tf.transpose(x, [0, 2, 1]))

    if not self.diag:
        # Overwrite each position's score with itself by 0 before softmax.
        # TODO better dim
        dim0 = melt.get_shape(scores, 0)
        dim1 = melt.get_shape(scores, 1)
        scores = tf.matrix_set_diag(scores, tf.zeros([dim0, dim1]))

    if mask is not None:
        length = melt.get_shape(x, 1)
        mask = tf.tile(tf.expand_dims(mask, axis=1), [1, length, 1])
        scores = softmax_mask(scores, mask)

    alpha = tf.nn.softmax(scores)
    self.alpha = alpha
    x = tf.matmul(alpha, x)

    if self.combine is None:
        # Fix: this branch used to return ``y``, a name that does not exist in
        # this method, so it raised NameError whenever no combiner was set.
        return x
    return self.combine(original_x, x, training=training)
def call(self, seq, seq_len=None, masks=None, output_method=OutputMethod.all, training=False):
    """Encode ``seq`` with the convolution layers in ``self.conv1ds``.

    When ``self.use_position_encoding`` is set, the input is first scaled by
    the square root of its last dimension, zeroed at padded positions (if
    ``seq_len`` is given) and offset by a position encoding. The outputs of
    all layers are concatenated on axis 2.

    Args:
        seq: Input sequence tensor.
        seq_len: Optional sequence lengths; used for the padding mask and
            forwarded to ``melt.rnn.encode_outputs``.
        masks: Optional per-layer multiplicative masks; when ``None`` each
            layer input goes through ``melt.dropout`` instead.
        output_method: Forwarded to ``melt.rnn.encode_outputs``.
        training: Forwarded to ``melt.dropout``.

    Returns:
        Whatever ``melt.rnn.encode_outputs`` returns for the concatenated
        layer outputs.
    """
    if self.use_position_encoding:
        hidden_size = melt.get_shape(seq, -1)
        # Scale embedding by the sqrt of the hidden size
        seq *= hidden_size ** 0.5
        if seq_len is not None:
            # sequence_mask is 1 at real positions and 0 at padding, so the
            # product zeroes the padded embeddings. maxlen pins the mask to
            # the time axis of ``seq`` in case the batch is padded beyond its
            # longest sequence; a missing ``seq_len`` simply skips masking.
            valid = tf.to_float(
                tf.sequence_mask(seq_len, maxlen=tf.shape(seq)[1]))
            seq *= tf.expand_dims(valid, -1)
        pos_encoding = model_utils.get_position_encoding(
            tf.shape(seq)[1], tf.shape(seq)[-1])
        seq = seq + pos_encoding

    layer_outputs = []
    for layer in range(self.num_layers):
        if masks is None:
            layer_input = melt.dropout(seq, self.keep_prob, training)
        else:
            layer_input = seq * masks[layer]
        seq = self.conv1ds[layer](layer_input)
        layer_outputs.append(seq)

    outputs = tf.concat(layer_outputs, 2)
    # No dropout on the concatenated output; that is left to the caller.
    # encode_outputs keeps the return format compatible with the rnn encoders.
    return melt.rnn.encode_outputs(outputs, seq_len, output_method)
def call(self, x, sequence_length=None, mask_fws=None, mask_bws=None, concat_layers=None, output_method=None, training=False):
    """Run the stacked bidirectional recurrent encoder over ``x``.

    Each layer runs a forward cell on the (dropped-out) previous output and a
    backward cell on its length-aware reversal, then concatenates both
    directions on axis 2.

    Args:
        x: Input sequence tensor.
        sequence_length: Optional per-example lengths; when ``None`` every
            example is treated as spanning the full time axis.
        mask_fws: Optional per-layer dropout masks for the forward direction.
        mask_bws: Optional per-layer dropout masks for the backward direction.
        concat_layers: Whether to concatenate the outputs of all layers; a
            falsy value falls back to ``self.concat_layers``.
        output_method: How ``encode_outputs`` reduces the sequence; a falsy
            value falls back to ``self.output_method``.
        training: Forwarded to ``dropout``.

    Returns:
        The encoded output, or ``(output, (state_fw, state_bw))`` when
        ``self.return_state`` is set. The states are those of the last layer.
    """
    concat_layers = concat_layers or self.concat_layers
    # Fix: the fallback used to be assigned to a misspelled name
    # (``output_mehtod``), so ``self.output_method`` was never applied.
    output_method = output_method or self.output_method

    if self.residual_connect:
        x = self.residual_linear(x)

    outputs = [x]
    keep_prob = self.keep_prob
    num_units = self.num_units
    batch_size = melt.get_batch_size(x)

    if sequence_length is None:
        full_len = melt.get_shape(x, 1)
        sequence_length = tf.ones([batch_size,], dtype=tf.int64) * full_len

    def new_mask(input_size):
        # One mask row per example, broadcast over the time axis.
        return dropout(tf.ones([batch_size, 1, input_size], dtype=tf.float32),
                       keep_prob=keep_prob,
                       training=training,
                       mode=None)

    def select_mask(provided, cache, layer, input_size):
        # Pick the dropout mask for one direction of one layer: a mask passed
        # by the caller wins, then the cached shared mask, else a fresh one.
        if provided is not None:
            return provided[layer]
        if not self.share_dropout:
            return new_mask(input_size)
        cached = cache[layer]
        # In eager mode a cached mask is rebuilt when the batch size changed.
        if cached is None or (tf.executing_eagerly() and
                              batch_size != cached.shape[0]):
            cached = new_mask(input_size)
            cache[layer] = cached
        return cached

    def run_cell(cell, inputs, init_state):
        # https://stackoverflow.com/questions/48233400/lstm-initial-state-from-dense-layer
        # gru and lstm differ: the lstm path yields two state tensors, which
        # are packed into a tuple.
        if self.cell == 'gru':
            out, state = cell(inputs, init_state)
        else:
            out, state1, state2 = cell(inputs, init_state)
            state = (state1, state2)
        return out, state

    # Pre-bound so that a zero-layer configuration does not hit an unbound
    # name when the states are stored below.
    state_fw = None
    state_bw = None

    for layer in range(self.num_layers):
        input_size_ = melt.get_shape(x, -1) if layer == 0 else 2 * num_units
        gru_fw, gru_bw = self.gru_fws[layer], self.gru_bws[layer]

        # ---- forward direction ----
        if self.train_init_state:
            init_fw = self.init_fw_layer(layer, batch_size)
            if self.cell == 'lstm':
                init_fw = (init_fw, self.init_fw2_layer(layer, batch_size))
        else:
            init_fw = None

        if self.recurrent_dropout:
            mask_fw = select_mask(mask_fws, self.dropout_mask_fw, layer,
                                  input_size_)
            inputs_fw = outputs[-1] * mask_fw
        else:
            inputs_fw = dropout(outputs[-1],
                                keep_prob=keep_prob,
                                training=training,
                                mode=None)

        out_fw, state_fw = run_cell(gru_fw, inputs_fw, init_fw)

        # ---- backward direction ----
        if self.train_init_state:
            init_bw = self.init_bw_layer(layer, batch_size)
            if self.cell == 'lstm':
                init_bw = (init_bw, self.init_bw2_layer(layer, batch_size))
        else:
            init_bw = None

        if self.recurrent_dropout:
            # The backward mask is only needed (and only built) on this path.
            mask_bw = select_mask(mask_bws, self.dropout_mask_bw, layer,
                                  input_size_)
            inputs_bw = outputs[-1] * mask_bw
        elif self.bw_dropout:
            inputs_bw = dropout(outputs[-1],
                                keep_prob=keep_prob,
                                training=training,
                                mode=None)
        else:
            # Share the forward direction's dropped-out input.
            inputs_bw = inputs_fw

        inputs_bw = tf.reverse_sequence(inputs_bw,
                                        seq_lengths=sequence_length,
                                        seq_axis=1,
                                        batch_axis=0)
        out_bw, state_bw = run_cell(gru_bw, inputs_bw, init_bw)
        # Undo the reversal so both directions are aligned in time.
        out_bw = tf.reverse_sequence(out_bw,
                                     seq_lengths=sequence_length,
                                     seq_axis=1,
                                     batch_axis=0)

        outputs.append(tf.concat([out_fw, out_bw], axis=2))
        if self.residual_connect:
            outputs[-1] = self.batch_norm(outputs[-2] + outputs[-1])

    if concat_layers:
        res = tf.concat(outputs[1:], axis=2)
    else:
        res = outputs[-1]

    res = encode_outputs(res,
                         output_method=output_method,
                         sequence_length=sequence_length)

    self.state = (state_fw, state_bw)
    if not self.return_state:
        return res
    return res, self.state
def call(self, x, training=False):
    """Apply ``linear1``, dropout, then ``linear2`` to ``x``.

    Args:
        x: Input tensor accepted by ``self.linear1``.
        training: Forwarded to ``dropout``.

    Returns:
        The output of ``self.linear2``.
    """
    hidden = self.linear1(x)
    # Dropout sits between the two projections.
    hidden = dropout(hidden, keep_prob=self.keep_prob, training=training)
    return self.linear2(hidden)
def call(self, input, training=False):
    """Multi-hop reader: match a passage against a query and emit logits.

    Args:
        input: Mapping with ``'query'`` and ``'passage'`` token ids. Depending
            on flags it must also provide ``'type'`` and the three
            ``'candidate_neg'`` / ``'candidate_pos'`` / ``'candidate_na'``
            sequences.
        training: Forwarded to the encoders, attention layers and dropout
            masks.

    Returns:
        Logits from ``self.logits`` (mixed per example with ``self.logits2``
        when ``FLAGS.split_type``), or label-embedding scores when
        ``FLAGS.use_label_emb`` is set.
    """
    query, passage = input['query'], input['passage']

    # reverse worse
    if FLAGS.cq_reverse:
        query, passage = passage, query

    q_len = melt.length(query)
    c_len = melt.length(passage)
    q_mask = tf.cast(query, tf.bool)
    c_mask = tf.cast(passage, tf.bool)
    q_emb = self.embedding(query)
    c_emb = self.embedding(passage)

    x = c_emb
    batch_size = melt.get_shape(x, 0)

    # With share_dropout every encode call below receives the same per-layer
    # masks; otherwise no masks are passed at all.
    shared_masks = {}
    if FLAGS.share_dropout:
        layer_dims = [
            melt.get_shape(x, -1) if layer == 0 else 2 * self.num_units
            for layer in range(self.num_layers)
        ]

        def fresh_mask(dim):
            return melt.dropout(tf.ones([batch_size, 1, dim], dtype=tf.float32),
                                keep_prob=self.keep_prob,
                                training=training,
                                mode=None)

        shared_masks['mask_fws'] = [fresh_mask(dim) for dim in layer_dims]
        shared_masks['mask_bws'] = [fresh_mask(dim) for dim in layer_dims]

    def run_encoder(embedded, length):
        return self.encode(embedded, length, training=training, **shared_masks)

    # NOTICE query and passage share same drop out, so same word still has
    # same embedding vector after dropout in query and passage
    c = run_encoder(c_emb, c_len)
    q = run_encoder(q_emb, q_len)

    # helps a lot using qc att, now bidaf att worse..
    # NOTE(review): every hop restarts from ``c`` and ``q``, so with more than
    # one hop only the last hop's result reaches the pooling -- confirm intent.
    for hop in range(FLAGS.hop):
        if FLAGS.use_bidaf_att:
            x = self.att_dot_attentions[hop](c, q, c_mask, q_mask,
                                             training=training)
        else:
            x = self.att_dot_attentions[hop](c, q, mask=q_mask,
                                             training=training)

        if FLAGS.use_att_encode:
            x = self.att_encodes[hop](x, c_len, training=training)
        x = self.match_dot_attentions[hop](x, x, mask=c_mask, training=training)
        x = self.match_encodes[hop](x, c_len, training=training)

    x = self.pooling(x, c_len, calc_word_scores=self.debug)

    if FLAGS.use_type:
        type_feature = tf.expand_dims(tf.to_float(input['type']), 1)
        x = tf.concat([x, type_feature], 1)

    # might helps ensemble
    if FLAGS.use_answer_emb:
        pooled_context = x
        candidates = [
            input['candidate_neg'],
            input['candidate_pos'],
            input['candidate_na'],
        ]
        # Stage by stage (lengths, embeddings, encodings, poolings), each in
        # neg / pos / na order.
        cand_lens = [melt.length(cand) for cand in candidates]
        cand_embs = [self.embedding(cand) for cand in candidates]
        cand_encs = [
            run_encoder(emb, length)
            for emb, length in zip(cand_embs, cand_lens)
        ]
        cand_pooled = [
            self.pooling(enc, length)
            for enc, length in zip(cand_encs, cand_lens)
        ]
        answer = tf.stack(cand_pooled, 1)

        # [batch_size, emb_dim]
        x = self.context_dense(x)
        # [batch_size, 3, emb_dim]
        answer = self.answer_dense(answer)
        # One score per candidate: dot product with the context vector.
        x = tf.matmul(answer, tf.transpose(tf.expand_dims(x, 1), [0, 2, 1]))
        x = tf.reshape(x, [batch_size, NUM_CLASSES])
        x = tf.concat([pooled_context, x], -1)

    # not help
    if FLAGS.combine_query:
        x = tf.concat([x, self.pooling(q, q_len)], -1)

    if FLAGS.use_label_emb:
        # use label emb seems not help ?
        x = self.label_dense(x)
        # TODO..
        return melt.dot(x, self.label_embedding(None))

    # split logits by type is useful, especially for type1, and improve a lot
    # with type1 only finetune
    if not FLAGS.split_type:
        return self.logits(x)

    logits1 = self.logits(x)
    logits2 = self.logits2(x)
    # 1.0 for examples whose type is 0, else 0.0; picks the head per example.
    type0 = tf.expand_dims(tf.to_float(tf.equal(input['type'], 0)), 1)
    return logits1 * type0 + logits2 * (1 - type0)