def call(self, inputs, **kwargs):
    """Follow the routing algorithm from Hinton's paper, but replace b = b + <u,v> with b = <u,v>.

    This change can improve the feature representation of the capsule. However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to get the standard routing.
    """
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    for i in range(self.routings):
        c = K.softmax(b, 1)
        o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o
def call(self, input):
    for i in range(self.num_layer):
        if i == 0:
            cross = Lambda(lambda x: Add()([
                K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x),
                      1, keepdims=True),
                self.bias[i],
                x]))(input)
        else:
            cross = Lambda(lambda x: Add()([
                K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input),
                      1, keepdims=True),
                self.bias[i],
                input]))(cross)
    return Flatten()(cross)
def self_attn_block(inp, n_c, squeeze_factor=8):
    """ GAN Self Attention Block
    Code borrowed from https://github.com/taki0112/Self-Attention-GAN-Tensorflow
    """
    msg = "Input channels must be >= {}, received nc={}".format(squeeze_factor, n_c)
    assert n_c // squeeze_factor > 0, msg
    var_x = inp
    shape_x = var_x.get_shape().as_list()

    var_f = Conv2D(n_c // squeeze_factor, 1,
                   kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)
    var_g = Conv2D(n_c // squeeze_factor, 1,
                   kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)
    var_h = Conv2D(n_c, 1,
                   kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)

    shape_f = var_f.get_shape().as_list()
    shape_g = var_g.get_shape().as_list()
    shape_h = var_h.get_shape().as_list()
    flat_f = Reshape((-1, shape_f[-1]))(var_f)
    flat_g = Reshape((-1, shape_g[-1]))(var_g)
    flat_h = Reshape((-1, shape_h[-1]))(var_h)

    var_s = Lambda(lambda var_x: K.batch_dot(var_x[0],
                                             Permute((2, 1))(var_x[1])))([flat_g, flat_f])

    beta = Softmax(axis=-1)(var_s)
    var_o = Lambda(lambda var_x: K.batch_dot(var_x[0], var_x[1]))([beta, flat_h])

    var_o = Reshape(shape_x[1:])(var_o)
    var_o = Scale()(var_o)
    out = add([var_o, inp])
    return out
def simple_context(X, mask, n=activation_rnn_size):
    """Reduce the input just to its headline part (second half).

    For each word in this part it concatenates the output of the previous layer (RNN)
    with a weighted average of the outputs of the description part.
    Only the last `rnn_size - activation_rnn_size` dimensions of each output are used;
    the first `activation_rnn_size` dimensions are used to compute the weights for the averaging.
    """
    desc, head = X[:, :maxlend, :], X[:, maxlend:, :]
    head_activations, head_words = head[:, :, :n], head[:, :, n:]
    desc_activations, desc_words = desc[:, :, :n], desc[:, :, n:]

    # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot
    # activation for every head word and every desc word
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))
    # make sure we don't use description words that are masked out
    activation_energies = activation_energies + -1e20 * K.expand_dims(
        1. - K.cast(mask[:, :maxlend], 'float32'), 1)

    # for every head word compute weights for every desc word
    activation_energies = K.reshape(activation_energies, (-1, maxlend))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights, (-1, maxlenh, maxlend))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
    return K.concatenate((desc_avg_word, head_words))
def call(self, x):
    assert isinstance(x, list)
    inp_a, inp_b = x

    outp_a = K.l2_normalize(inp_a, -1)
    outp_b = K.l2_normalize(inp_b, -1)
    alpha = K.batch_dot(outp_b, outp_a, axes=[2, 2])
    alpha = K.l2_normalize(alpha, 1)
    alpha = K.one_hot(K.argmax(alpha, 1), K.int_shape(inp_a)[1])
    hmax = K.batch_dot(alpha, outp_b, axes=[1, 1])
    kcon = K.eye(K.int_shape(inp_a)[1], dtype='float32')

    m = []
    for i in range(self.output_dim):
        outp_a = inp_a * self.W[i]
        outp_hmax = hmax * self.W[i]
        outp_a = K.l2_normalize(outp_a, -1)
        outp_hmax = K.l2_normalize(outp_hmax, -1)
        outp = K.batch_dot(outp_hmax, outp_a, axes=[2, 2])
        outp = K.sum(outp * kcon, -1, keepdims=True)
        m.append(outp)

    if self.output_dim > 1:
        persp = K.concatenate(m, 2)
    else:
        persp = m
    return [persp, persp]
def A_network_output(x):
    # The input of this layer is [L, mu, a] in concatenated form. We first split
    # those up.
    idx = 0
    L_flat = x[:, idx:idx + (self.nb_actions * self.nb_actions + self.nb_actions) // 2]
    idx += (self.nb_actions * self.nb_actions + self.nb_actions) // 2
    mu = x[:, idx:idx + self.nb_actions]
    idx += self.nb_actions
    a = x[:, idx:idx + self.nb_actions]
    idx += self.nb_actions

    # Create L and L^T matrix, which we use to construct the positive-definite matrix P.
    Ls = []
    LTs = []
    for idx in range(self.batch_size):
        L = K.zeros((self.nb_actions, self.nb_actions))
        L = T.set_subtensor(L[np.tril_indices(self.nb_actions)], L_flat[idx, :])
        diag = K.exp(T.diag(L))
        L = T.set_subtensor(L[np.diag_indices(self.nb_actions)], diag)
        Ls.append(L)
        LTs.append(K.transpose(L))
    # TODO: diagonal elements exp
    L = K.pack(Ls)
    LT = K.pack(LTs)
    P = K.batch_dot(L, LT, axes=(1, 2))
    assert K.ndim(P) == 3

    # Combine a, mu and P into a scalar (over the batches).
    A = -.5 * K.batch_dot(K.batch_dot(a - mu, P, axes=(1, 2)), a - mu, axes=1)
    assert K.ndim(A) == 2

    return A
def call(self, x):
    # If only Q_seq, K_seq and V_seq are passed in, no masking is done.
    # If Q_seq, K_seq, V_seq, Q_len and V_len are all passed in, the padded part is masked.
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x

    # Linear transformations of Q, K and V
    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Compute the dot products, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3])
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Compute the output and mask it
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq
def call(self, x, mask=None):
    q, k, v = x
    d_k = q.shape.as_list()[2]

    # in pure tensorflow:
    # weights = tf.matmul(x_batch, tf.transpose(y_batch, perm=[0, 2, 1]))
    # normalized_weights = tf.nn.softmax(weights/scaling)
    # output = tf.matmul(normalized_weights, x_batch)

    weights = K.batch_dot(q, k, axes=[2, 2])

    if mask is not None:
        # add mask weights
        if isinstance(mask, (list, tuple)):
            if len(mask) != 1:
                raise ValueError("mask can only be a Tensor or a list of length 1 containing a tensor.")
            mask = mask[0]
        weights += -1e10 * (1 - mask)

    normalized_weights = K.softmax(weights / np.sqrt(d_k))
    output = K.batch_dot(normalized_weights, v)

    if self._return_attention:
        return [output, normalized_weights]
    return output
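# --- Illustrative sketch (not part of the layer above) ----------------------------------------
# The "in pure tensorflow" comment has a direct counterpart: for 3-D tensors,
# K.batch_dot(q, k, axes=[2, 2]) produces the same score matrix as tf.matmul(q, k, transpose_b=True).
# The toy shapes below are assumptions made for this check only.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

q = tf.constant(np.random.rand(2, 4, 8), dtype=tf.float32)  # (batch, T_q, d_k)
k = tf.constant(np.random.rand(2, 5, 8), dtype=tf.float32)  # (batch, T_k, d_k)

scores_batch_dot = K.batch_dot(q, k, axes=[2, 2])   # (batch, T_q, T_k)
scores_matmul = tf.matmul(q, k, transpose_b=True)   # (batch, T_q, T_k)

print(np.allclose(K.eval(scores_batch_dot), K.eval(scores_matmul)))  # expected: True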
def recurrence(y_i, h):
    h_permute = K.permute_dimensions(h, [0, 2, 1])  # (batch_size, encoding_dim, input_length)
    e = K.l2_normalize(
        K.batch_dot(h_permute, s, axes=1),  # (batch_size, input_length)
        axis=1)  # (batch_size, input_length)

    # eqn 6
    alpha = K.softmax(e)  # (batch_size, input_length)

    # eqn 5
    c = K.batch_dot(h, alpha, axes=1)  # (batch_size, encoding_dim)
    recurrence_result = K.expand_dims(
        K.concatenate([c, y_i], axis=1),
        dim=1)  # (batch_size, 1, 2 * encoding_dim)

    expanded_h = Input(shape=(1, 2 * encoding_dim), name='expanded_h')
    gru = Sequential([
        GRU(output_dim,
            return_sequences=False,
            input_shape=(1, 2 * encoding_dim))
    ])
    model = Model(input=[expanded_h], output=[gru(expanded_h)])  # (batch_size, 1, output_dim)
    return model(recurrence_result)
def semantic_matrix(argv):
    assert len(argv) == 2
    q = argv[0]
    a = argv[1]
    q_sqrt = K.sqrt((q ** 2).sum(axis=2, keepdims=True))
    a_sqrt = K.sqrt((a ** 2).sum(axis=2, keepdims=True))
    denominator = K.batch_dot(q_sqrt, K.permute_dimensions(a_sqrt, [0, 2, 1]))
    return K.batch_dot(q, K.permute_dimensions(a, [0, 2, 1])) / (denominator + SAFE_EPSILON)
def call(self, x, mask=None): stride_row, stride_col = self.subsample _, feature_dim, nb_filter = self.W_shape if self.dim_ordering == 'th': if K._backend == 'theano': output = [] for i in range(self.output_row): for j in range(self.output_col): slice_row = slice(i * stride_row, i * stride_row + self.nb_row) slice_col = slice(j * stride_col, j * stride_col + self.nb_col) x_flatten = K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim)) output.append(K.dot(x_flatten, self.W[i * self.output_col + j, :, :])) output = K.concatenate(output, axis=0) else: xs = [] for i in range(self.output_row): for j in range(self.output_col): slice_row = slice(i * stride_row, i * stride_row + self.nb_row) slice_col = slice(j * stride_col, j * stride_col + self.nb_col) xs.append(K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim))) x_aggregate = K.concatenate(xs, axis=0) output = K.batch_dot(x_aggregate, self.W) output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter)) output = K.permute_dimensions(output, (2, 3, 0, 1)) elif self.dim_ordering == 'tf': xs = [] for i in range(self.output_row): for j in range(self.output_col): slice_row = slice(i * stride_row, i * stride_row + self.nb_row) slice_col = slice(j * stride_col, j * stride_col + self.nb_col) xs.append(K.reshape(x[:, slice_row, slice_col, :], (1, -1, feature_dim))) x_aggregate = K.concatenate(xs, axis=0) output = K.batch_dot(x_aggregate, self.W) output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter)) output = K.permute_dimensions(output, (2, 0, 1, 3)) else: raise Exception('Invalid dim_ordering: ' + self.dim_ordering) if self.bias: if self.dim_ordering == 'th': output += K.reshape(self.b, (1, nb_filter, self.output_row, self.output_col)) elif self.dim_ordering == 'tf': output += K.reshape(self.b, (1, self.output_row, self.output_col, nb_filter)) else: raise Exception('Invalid dim_ordering: ' + self.dim_ordering) output = self.activation(output) return output
def call(self, inputs, mask=None): if type(inputs) is not list or len(inputs) <= 1: raise Exception('Merge must be called on a list of tensors ' '(at least 2). Got: ' + str(inputs)) # case: "mode" is a lambda or function. if hasattr(self.mode, '__call__'): # TODO: consider making it possible to # pass custom arguments to lambda. arguments = {} return self.mode(inputs, **arguments) if self.mode == 'sum' or self.mode == 'ave': s = inputs[0] for i in range(1, len(inputs)): s += inputs[i] if self.mode == 'ave': s /= len(inputs) return s elif self.mode == 'concat': return K.concatenate(inputs, axis=self.concat_axis) elif self.mode == 'mul': s = inputs[0] for i in range(1, len(inputs)): s *= inputs[i] return s elif self.mode == 'dot': l1 = inputs[0] l2 = inputs[1] output = K.batch_dot(l1, l2, self.dot_axes) return output elif self.mode == 'cos': l1 = inputs[0] l2 = inputs[1] denominator = K.sqrt(K.batch_dot(l1, l1, self.dot_axes) * K.batch_dot(l2, l2, self.dot_axes)) output = K.batch_dot(l1, l2, self.dot_axes) / denominator output = K.expand_dims(output, 1) return output elif self.mode == 'abs': s = inputs[0] * inputs[0] for i in range(1, len(inputs)): s += inputs[i] * inputs[i] return K.sqrt(s) elif self.mode == 'atan2': return T.tensor.arctan2(inputs[1], inputs[0]) else: raise Exception('Unknown merge mode.')
def euclidDist(inputs):
    assert len(inputs) == 2, "euclidDist requires 2 inputs"
    l1 = inputs[0]
    l2 = inputs[1]
    x = l1 - l2
    output = K.batch_dot(x, x, axes=1)
    output = K.reshape(output, (1,))
    return output
def call(self, x, mask=None):
    tupleEmbed = self.tupleEmbed
    relationEmbed = self.relationEmbed
    nb = x.shape[0]
    entity_embeddings = tupleEmbed[x[:, 0], x[:, 2]]
    relation_embeddings = relationEmbed[x[:, 1]]
    dot_prod = K.batch_dot(entity_embeddings, relation_embeddings, axes=1)
    return self.activation(dot_prod)
def step(self, x, states):
    ytm, stm = states

    # repeat the hidden state to the length of the sequence
    _stm = K.repeat(stm, self.timesteps)

    # now multiply the weight matrix with the repeated hidden state
    _Wxstm = K.dot(_stm, self.W_a)

    # calculate the attention probabilities
    # this relates how much other timesteps contributed to this one.
    et = K.dot(activations.tanh(_Wxstm + self._uxpb),
               K.expand_dims(self.V_a))
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # ~~~> calculate new hidden state
    # first calculate the "r" gate:
    rt = activations.sigmoid(
        K.dot(ytm, self.W_r)
        + K.dot(stm, self.U_r)
        + K.dot(context, self.C_r)
        + self.b_r)

    # now calculate the "z" gate
    zt = activations.sigmoid(
        K.dot(ytm, self.W_z)
        + K.dot(stm, self.U_z)
        + K.dot(context, self.C_z)
        + self.b_z)

    # calculate the proposal hidden state:
    s_tp = activations.tanh(
        K.dot(ytm, self.W_p)
        + K.dot((rt * stm), self.U_p)
        + K.dot(context, self.C_p)
        + self.b_p)

    # new hidden state:
    st = (1 - zt) * stm + zt * s_tp

    yt = activations.softmax(
        K.dot(ytm, self.W_o)
        + K.dot(stm, self.U_o)
        + K.dot(context, self.C_o)
        + self.b_o)

    if self.return_probabilities:
        return at, [yt, st]
    else:
        return yt, [yt, st]
def get_similarity(self):
    ''' Specify similarity in configuration under 'similarity_params' -> 'mode'.
    If a parameter is needed for the model, specify it in 'similarity_params'.

    Example configuration:

    config = {
        ... other parameters ...
        'similarity_params': {
            'mode': 'gesd',
            'gamma': 1,
            'c': 1,
        }
    }

    cosine: dot(a, b) / sqrt(dot(a, a) * dot(b, b))
    polynomial: (gamma * dot(a, b) + c) ^ d
    sigmoid: tanh(gamma * dot(a, b) + c)
    rbf: exp(-gamma * l2_norm(a - b) ^ 2)
    euclidean: 1 / (1 + l2_norm(a - b))
    exponential: exp(-gamma * l2_norm(a - b))
    gesd: euclidean * sigmoid
    aesd: (euclidean + sigmoid) / 2
    '''
    params = self.similarity_params
    similarity = params['mode']

    axis = lambda a: len(a._keras_shape) - 1
    dot = lambda a, b: K.batch_dot(a, b, axes=axis(a))
    l2_norm = lambda a, b: K.sqrt(K.sum((a - b) ** 2, axis=axis(a), keepdims=True))
    l1_norm = lambda a, b: K.sum(K.abs(a - b), axis=axis(a), keepdims=True)

    if similarity == 'cosine':
        return lambda x: dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
    elif similarity == 'polynomial':
        return lambda x: (params['gamma'] * dot(x[0], x[1]) + params['c']) ** params['d']
    elif similarity == 'sigmoid':
        return lambda x: K.tanh(params['gamma'] * dot(x[0], x[1]) + params['c'])
    elif similarity == 'rbf':
        return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]) ** 2)
    elif similarity == 'euclidean':
        return lambda x: 1 / (1 + l2_norm(x[0], x[1]))
    elif similarity == 'l1':
        return lambda x: -l1_norm(x[0], x[1])
    elif similarity == 'exponential':
        return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]))
    elif similarity == 'gesd':
        euclidean = lambda x: 1 / (1 + l2_norm(x[0], x[1]))
        sigmoid = lambda x: 1 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
        return lambda x: euclidean(x) * sigmoid(x)
    elif similarity == 'aesd':
        euclidean = lambda x: 0.5 / (1 + l2_norm(x[0], x[1]))
        sigmoid = lambda x: 0.5 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
        return lambda x: euclidean(x) + sigmoid(x)
    else:
        raise Exception('Invalid similarity: {}'.format(similarity))
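# --- Hypothetical usage sketch (names and shapes are illustrative, not from the original project) --
# The function returned by get_similarity() operates on a pair [a, b] of 2-D tensors, so it can be
# wrapped in a Lambda layer that merges two encoded inputs into a single score. This assumes an
# object `qa_model` whose similarity_params holds the config shown in the docstring,
# e.g. {'mode': 'gesd', 'gamma': 1, 'c': 1}.
from keras.layers import Input, Lambda
from keras.models import Model

question_vec = Input(shape=(128,))  # assumed: encoded question, shape (batch, 128)
answer_vec = Input(shape=(128,))    # assumed: encoded answer, shape (batch, 128)

similarity = qa_model.get_similarity()
score = Lambda(lambda x: similarity(x))([question_vec, answer_vec])  # shape (batch, 1)

scorer = Model(inputs=[question_vec, answer_vec], outputs=score)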
def call(self, inputs, training=None):
    # inputs.shape=[None, input_num_capsule, input_dim_capsule]
    # inputs_expand.shape=[None, 1, input_num_capsule, input_dim_capsule]
    inputs_expand = K.expand_dims(inputs, 1)

    # Replicate num_capsule dimension to prepare being multiplied by W
    # inputs_tiled.shape=[None, num_capsule, input_num_capsule, input_dim_capsule]
    inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])

    # Compute `inputs * W` by scanning inputs_tiled on dimension 0.
    # x.shape=[num_capsule, input_num_capsule, input_dim_capsule]
    # W.shape=[num_capsule, input_num_capsule, dim_capsule, input_dim_capsule]
    # Regard the first two dimensions as `batch` dimension,
    # then matmul: [input_dim_capsule] x [dim_capsule, input_dim_capsule]^T -> [dim_capsule].
    # inputs_hat.shape = [None, num_capsule, input_num_capsule, dim_capsule]
    inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]), elems=inputs_tiled)

    # Begin: Routing algorithm ----------------------------------------------------------------#
    # The prior for coupling coefficient, initialized as zeros.
    # b.shape = [None, self.num_capsule, self.input_num_capsule].
    b = tf.zeros(shape=[K.shape(inputs_hat)[0], self.num_capsule, self.input_num_capsule])

    assert self.routings > 0, 'The routings should be > 0.'
    for i in range(self.routings):
        # c.shape=[batch_size, num_capsule, input_num_capsule]
        c = tf.nn.softmax(b, dim=1)

        # c.shape = [batch_size, num_capsule, input_num_capsule]
        # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
        # The first two dimensions as `batch` dimension,
        # then matmul: [input_num_capsule] x [input_num_capsule, dim_capsule] -> [dim_capsule].
        # outputs.shape=[None, num_capsule, dim_capsule]
        outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))  # [None, 10, 16]

        if i < self.routings - 1:
            # outputs.shape = [None, num_capsule, dim_capsule]
            # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
            # The first two dimensions as `batch` dimension,
            # then matmul: [dim_capsule] x [input_num_capsule, dim_capsule]^T -> [input_num_capsule].
            # b.shape=[batch_size, num_capsule, input_num_capsule]
            b += K.batch_dot(outputs, inputs_hat, [2, 3])
    # End: Routing algorithm ------------------------------------------------------------------#

    return outputs
def _normalize_attention(attmat):
    att = attmat[0]
    mat = attmat[1]
    if transpose:
        att = K.permute_dimensions(att, (0, 2, 1))
    # 3d softmax
    e = K.exp(att - K.max(att, axis=-1, keepdims=True))
    s = K.sum(e, axis=-1, keepdims=True)
    sm_att = e / s
    return K.batch_dot(sm_att, mat)
def call(self, x, mask=None):
    # x is of dimension nb x (2*context_size + 3), where x[:, 0] are the words,
    # x[:, 3:] is the context, x[:, 1] is the context word and
    # x[:, 2] is the INDEX of the context word in the context.
    W_g = self.W_g
    W_s = self.W_s
    nb = x.shape[0]
    context_length = self.input_dim - 3
    actual_word_indx = (self.input_dim + 3) // 2  # same as context_size + 3

    right_senses, ignore_updates = theano.scan(disambiguate,
                                               sequences=x[:, 3:],
                                               non_sequences=[context_length, W_g, W_s])
    words_sense_vector = W_s[x[:, 0], right_senses[:, actual_word_indx]]

    a, ignore_updates = theano.scan(get_sense_vector,
                                    sequences=[x[:, 0], right_senses, x[:, 2]],
                                    non_sequences=[W_g, W_s])
    contexts_sense_vector = a

    sense_dot_prod = K.batch_dot(words_sense_vector, contexts_sense_vector, axes=1)
    global_dot_prod = K.batch_dot(W_g[x[:, 0]], W_g[x[:, 1]], axes=1)
    dot_prod = sense_dot_prod + global_dot_prod

    # return a[0]
    # return self.activation(K.log(K.sigmoid(dot_prod)))
    return self.activation(dot_prod)
def cos_sim_matvec(values):
    mat, vec = values
    # mat = K.l2_normalize(mat, axis=-1)
    # vec = K.l2_normalize(vec, axis=-1)
    mat = myl2(mat, axis=-1)
    vec = myl2(vec, axis=-1)
    dodo = K.batch_dot(mat, vec, axes=[2, 1])
    # dodo = dodo.dimshuffle((0, 1, 'x'))
    # return T.extra_ops.repeat()
    return dodo
def _additive_similarity(self, source, query):
    concatenation = K.concatenate([source, query], axis=2)
    nonlinearity = K.tanh(K.dot(concatenation, self._weights["w_a"]))

    # tile the weight vector (1, 1, dim) for each time step and each element of the batch -> (bs, T, dim)
    source_shape = K.shape(source)
    vaeff = K.tile(K.expand_dims(self._weights["v_a"], 0), [source_shape[0], source_shape[1], 1])

    similarity = K.batch_dot(K.permute_dimensions(vaeff, [0, 2, 1]), nonlinearity, axes=[1, 2])
    return similarity
def call(self, inputs, **kwargs):
    if type(inputs) is list:
        assert len(inputs) == 2
        inputs, mask = inputs
    else:
        x = inputs
        # enlarge the range of values in x by mapping max(new_x) = 1 and the others to < 0
        x = (x - K.max(x, 1, True)) / K.epsilon() + 1
        mask = K.clip(x, self.clip_value[0], self.clip_value[1])  # clip value between 0 and 1
    masked_input = K.batch_dot(inputs, mask, [1, 1])
    return masked_input
def _transform(self, X, affine_transformation, output_size):
    batch_size, num_channels = K.shape(X)[0], K.shape(X)[3]
    transformations = K.reshape(affine_transformation, shape=(batch_size, 2, 3))
    # transformations = K.cast(affine_transformation[:, 0:2, :], 'float32')
    regular_grids = self._make_regular_grids(batch_size, *output_size)
    sampled_grids = K.batch_dot(transformations, regular_grids)
    interpolated_image = self._interpolate(X, sampled_grids, output_size)
    new_shape = (batch_size, output_size[0], output_size[1], num_channels)
    interpolated_image = K.reshape(interpolated_image, new_shape)
    return interpolated_image
def call(self, xy, mask=None):
    if not isinstance(xy, list) or len(xy) != 2:
        raise Exception('Inner attention must be called on 2 inputs. Got: ' + str(xy))
    x, y = xy
    assert K.ndim(x) == 3, "x should be 3d (m x d), but got %d" % (K.ndim(x))
    assert K.ndim(y) == 3, "y should be 3d (n x d), but got %d" % (K.ndim(y))
    # assert d1 == d2, "x and y should be of same dimension, but dim(x)=%d, dim(y)=%d" % (d1, d2)

    z = K.batch_dot(x, y, axes=2)

    # Softmax
    e = K.exp(z - K.max(z, axis=-1, keepdims=True))
    s = K.sum(e, axis=-1, keepdims=True)
    z = e / s

    # z should be 10
    z = K.batch_dot(z, x, axes=1)
    return z
def routing(u_hat_vecs, beta_a, iterations, output_capsule_num, i_activations):
    b = keras.backend.zeros_like(u_hat_vecs[:, :, :, 0])
    if i_activations is not None:
        i_activations = i_activations[..., tf.newaxis]
    for i in range(iterations):
        if False:
            leak = tf.zeros_like(b, optimize=True)
            leak = tf.reduce_sum(leak, axis=1, keep_dims=True)
            leaky_logits = tf.concat([leak, b], axis=1)
            leaky_routing = tf.nn.softmax(leaky_logits, dim=1)
            c = tf.split(leaky_routing, [1, output_capsule_num], axis=1)[1]
        else:
            c = softmax(b, 1)
        # if i_activations is not None:
        #     tf.transpose(tf.transpose(c, perm=[0, 2, 1]) * i_activations, perm=[0, 2, 1])
        outputs = squash_v1(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < iterations - 1:
            b = b + K.batch_dot(outputs, u_hat_vecs, [2, 3])
    poses = outputs
    activations = K.sqrt(K.sum(K.square(poses), 2))
    return poses, activations
def call(self, X, mask=None):
    input_shape = self.input_spec[0].shape
    w = self.attention
    input_length = K.shape(X)[1]

    X = K.reshape(X, (-1,) + input_shape[2:])  # (nb_samples * timesteps, ...)
    w = K.reshape(w, (-1,) + input_shape[2:3])  # (nb_samples * timesteps, ...)
    y = K.batch_dot(w, X)
    # Not sure why this is not working, but I should use the layer
    # y = self.layer.call(w, X)  # (nb_samples, timesteps, ...)

    y = K.reshape(y, (-1, input_length) + input_shape[3:])
    return y
def call(self, x):
    source, query = x
    similarity = self._similarity(source, query)
    expected_similarity_shape = [source.shape.as_list()[0],
                                 source.shape.as_list()[1],
                                 source.shape.as_list()[1]]
    if similarity.shape.as_list() != expected_similarity_shape:
        raise RuntimeError("The similarity function has returned a similarity with shape {0}, "
                           "but expected {1}".format(similarity.shape.as_list(),
                                                     expected_similarity_shape))
    score = K.softmax(similarity)
    output = K.batch_dot(score, source, axes=[1, 1])
    return output
def _pairwise_distances(self, inputs: List[Tensor]) -> Tensor:
    emb_c, emb_r = inputs
    bs = K.shape(emb_c)[0]
    embeddings = K.concatenate([emb_c, emb_r], 0)
    dot_product = K.dot(embeddings, K.transpose(embeddings))
    square_norm = K.batch_dot(embeddings, embeddings, axes=1)
    distances = K.transpose(square_norm) - 2.0 * dot_product + square_norm
    distances = K.slice(distances, (0, bs), (bs, bs))
    distances = K.clip(distances, 0.0, None)
    mask = K.cast(K.equal(distances, 0.0), K.dtype(distances))
    distances = distances + mask * 1e-16
    distances = K.sqrt(distances)
    distances = distances * (1.0 - mask)
    return distances
def call(self, inputs, **kwargs):
    # use true label to select target capsule, shape=[batch_size, num_capsule]
    if type(inputs) is list:
        # true label is provided with shape = [batch_size, n_classes], i.e. one-hot code.
        assert len(inputs) == 2
        inputs, mask = inputs
    else:
        # if no true label, mask by the max length of vectors of capsules
        x = inputs
        # Enlarge the range of values in x to make max(new_x)=1 and others < 0
        x = (x - K.max(x, 1, True)) / K.epsilon() + 1
        mask = K.clip(x, 0, 1)  # the max value in x clipped to 1 and others to 0

    # masked inputs, shape = [batch_size, dim_vector]
    inputs_masked = K.batch_dot(inputs, mask, [1, 1])
    return inputs_masked
def call(self, inputs, training=None): # inputs.shape=[None, input_num_capsule, input_dim_vector] # Expand dims to [None, input_num_capsule, 1, 1, input_dim_vector] inputs_expand = K.expand_dims(K.expand_dims(inputs, 2), 2) # Replicate num_capsule dimension to prepare being multiplied by W # Now it has shape = [None, input_num_capsule, num_capsule, 1, input_dim_vector] inputs_tiled = K.tile(inputs_expand, [1, 1, self.num_capsule, 1, 1]) """ # Compute `inputs * W` by expanding the first dim of W. More time-consuming and need batch_size. # Now W has shape = [batch_size, input_num_capsule, num_capsule, input_dim_vector, dim_vector] w_tiled = K.tile(K.expand_dims(self.W, 0), [self.batch_size, 1, 1, 1, 1]) # Transformed vectors, inputs_hat.shape = [None, input_num_capsule, num_capsule, 1, dim_vector] inputs_hat = K.batch_dot(inputs_tiled, w_tiled, [4, 3]) """ # Compute `inputs * W` by scanning inputs_tiled on dimension 0. This is faster but requires Tensorflow. # inputs_hat.shape = [None, input_num_capsule, num_capsule, 1, dim_vector] inputs_hat = tf.scan(lambda ac, x: K.batch_dot(x, self.W, [3, 2]), elems=inputs_tiled, initializer=K.zeros([self.input_num_capsule, self.num_capsule, 1, self.dim_vector])) """ # Routing algorithm V1. Use tf.while_loop in a dynamic way. def body(i, b, outputs): c = tf.nn.softmax(self.bias, dim=2) # dim=2 is the num_capsule dimension outputs = squash(K.sum(c * inputs_hat, 1, keepdims=True)) b = b + K.sum(inputs_hat * outputs, -1, keepdims=True) return [i-1, b, outputs] cond = lambda i, b, inputs_hat: i > 0 loop_vars = [K.constant(self.num_routing), self.bias, K.sum(inputs_hat, 1, keepdims=True)] _, _, outputs = tf.while_loop(cond, body, loop_vars) """ # Routing algorithm V2. Use iteration. V2 and V1 both work without much difference on performance assert self.num_routing > 0, 'The num_routing should be > 0.' for i in range(self.num_routing): c = tf.nn.softmax(self.bias, dim=2) # dim=2 is the num_capsule dimension # outputs.shape=[None, 1, num_capsule, 1, dim_vector] outputs = squash(K.sum(c * inputs_hat, 1, keepdims=True)) # last iteration needs not compute bias which will not be passed to the graph any more anyway. if i != self.num_routing - 1: # self.bias = K.update_add(self.bias, K.sum(inputs_hat * outputs, [0, -1], keepdims=True)) self.bias += K.sum(inputs_hat * outputs, -1, keepdims=True) # tf.summary.histogram('BigBee', self.bias) # for debugging return K.reshape(outputs, [-1, self.num_capsule, self.dim_vector])
def step_do(self, step_in, states): # 定义每一步的迭代 in_value = step_in if 0 < self.dropout < 1.: self._dropout_mask = K.in_train_phase( K.dropout(K.ones_like(step_in), self.dropout), K.ones_like(step_in)) if 0 < self.dropout < 1.: in_value = step_in * self._dropout_mask # hist = K.tanh(K.dot(states[0], self.rec_kernel)) # hist = K.tanh(states[0]) in_value = K.expand_dims(in_value, axis=-2) l_state = K.expand_dims(states[0], axis=-2) l_inp = K.concatenate([l_state, in_value], axis=-2) s_state = K.expand_dims(states[1], axis=-2) s_inp = K.concatenate([s_state, in_value], axis=-2) l_query = K.dot(l_inp, self.query_kernel) l_key = K.dot(l_inp, self.key_kernel) l_value = K.dot(l_inp, self.value_kernel) l_attention_prob = K.batch_dot(l_query, l_key, axes=[2, 2]) / np.sqrt( self.units) print(l_attention_prob.shape) l_attention_prob = K.softmax(l_attention_prob) l_outputs = K.batch_dot(l_attention_prob, l_value) l_outputs = K.tanh(l_outputs) s_query = K.dot(s_inp, self.query_kernel) s_key = K.dot(s_inp, self.key_kernel) s_value = K.dot(s_inp, self.value_kernel) s_attention_prob = K.batch_dot(s_query, s_key, axes=[2, 2]) / np.sqrt( self.units) s_attention_prob = K.softmax(s_attention_prob) s_outputs = K.batch_dot(s_attention_prob, s_value) s_outputs = K.tanh(s_outputs) lt = K.expand_dims(l_outputs[:, 0], axis=-2) st = K.expand_dims(s_outputs[:, 1], axis=-2) outputs = K.concatenate([lt, st], axis=-2) query = K.dot(outputs, self.query_kernel) key = K.dot(outputs, self.key_kernel) value = K.dot(outputs, self.value_kernel) attention_prob = K.batch_dot(query, key, axes=[2, 2]) / np.sqrt( self.units) attention_prob = K.softmax(attention_prob) print(attention_prob.shape) att_out = K.batch_dot(attention_prob, value, axes=[2, 1]) # outputs = K.concatenate([l_outputs[:,0], s_outputs[:,1]], axis=-1) # outputs = 0.5*l_outputs[:,0] + 0.5*s_outputs[:,1] print('inner_outputs.shape', outputs.shape) return att_out[:, 0], [att_out[:, 0], att_out[:, 1]]
def call(self, inputs): if not isinstance(inputs, list): raise ValueError('This layer should be called ' 'on a list of 2/3 inputs.') if len(inputs) != 3 and len(inputs) != 2: raise ValueError('This layer should be called ' 'on a list of 2/3 inputs.' 'Got ' + str(len(inputs)) + ' inputs.') # if len(inputs) != 1: # raise ValueError('This layer should be called ' # 'on only 1 input.' # 'Got ' + str(len(input)) + ' inputs.') input_real = inputs[0] input_imag = inputs[1] ndims = len(inputs[0].shape) if self.average_weights: output_r = K.mean(input_real, axis=ndims - 2, keepdims=False) output_i = K.mean(input_imag, axis=ndims - 2, keepdims=False) else: #For embedding layer inputs[2] is (None, embedding_dim,1) #For test inputs[2] is (None, embedding_dim) if len(inputs[2].shape) == ndims - 1: weight = K.expand_dims(inputs[2]) else: weight = inputs[2] weight = K.repeat_elements(weight, input_real.shape[-1], axis=ndims - 1) output_real = input_real * weight #shape: (None, 300, 300) output_real = K.sum(output_real, axis=ndims - 2) output_imag = input_imag * weight output_imag = K.sum(output_imag, axis=ndims - 2) output_real_transpose = K.expand_dims(output_real, axis=ndims - 2) output_imag_transpose = K.expand_dims(output_imag, axis=ndims - 2) # output_real_transpose = K.permute_dimensions(output_real, (0,2,1)) # output_imag_transpose = K.permute_dimensions(output_imag, (0,2,1)) output_real = K.expand_dims(output_real) output_imag = K.expand_dims(output_imag) print(output_real.shape) print(output_real_transpose.shape) # print(output_imag.shape) output_r = K.batch_dot( output_real, output_real_transpose, axes=[ndims - 1, ndims]) + K.batch_dot( output_imag, output_imag_transpose, axes=[ndims - 1, ndims]) output_i = K.batch_dot( output_imag, output_real_transpose, axes=[ndims - 1, ndims]) - K.batch_dot( output_real, output_imag_transpose, axes=[ndims - 1, ndims]) return [output_r, output_i]
def call(self, inputs):
    z = inputs  # z.shape=(batch_size, latent_dim)
    z = K.expand_dims(z, 1)
    return z - K.expand_dims(self.mean, 0)

def compute_output_shape(self, input_shape):
    return (None, self.num_classes, input_shape[-1])


gaussian = Gaussian(num_classes, name='priors')
z_prior_mean = gaussian(z)

clvae = Model([x, y_in], [x_recon, z_prior_mean])

z_mean = K.expand_dims(z_mean, 1)
z_log_var = K.expand_dims(z_log_var, 1)

lamb = 0.5
xent_loss = 0.5 * K.mean((x - x_recon) ** 2, 0)
kl_loss = -0.5 * (z_log_var - K.square(z_prior_mean))
kl_loss = K.mean(K.batch_dot(K.expand_dims(y_in, 1), kl_loss), 0)
clvae_loss = lamb * K.sum(xent_loss) + K.sum(kl_loss)

clvae.add_loss(clvae_loss)
clvae.compile(optimizer='adam')
clvae.summary()

clvae_history = clvae.fit([x_train, to_categorical(y_train)],
                          shuffle=True,
                          epochs=epochs,
                          batch_size=batch_size,
                          validation_data=([x_test, to_categorical(y_test)], None))
def cosine(self, x):
    axis = len(x[0]._keras_shape) - 1
    dot = lambda a, b: K.batch_dot(a, b, axes=axis)
    return dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
def _g_var_chol(p, epsilon):
    mu, var, chol = p
    epsilon = K.batch_dot(epsilon, chol, axes=(1, 2))
    return mu + K.sqrt(K.abs(var)) * epsilon
def _g_logvar_chol_2D1(p, epsilon):
    mu, logvar, chol = p
    epsilon = K.batch_dot(epsilon, chol, axes=(1, 1))
    return mu + K.exp(logvar / 2) * epsilon
def call(self, x, mask=None): # TODO: validate input shape assert (len(x) == 3) L_flat = x[0] mu = x[1] a = x[2] if self.mode == 'full': # Create L and L^T matrix, which we use to construct the positive-definite matrix P. L = None LT = None if K.backend() == 'theano': import theano.tensor as T import theano def fn(x, L_acc, LT_acc): x_ = K.zeros((self.nb_actions, self.nb_actions)) x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)], x) diag = K.exp(T.diag(x_)) + K.epsilon() x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], diag) return x_, x_.T outputs_info = [ K.zeros((self.nb_actions, self.nb_actions)), K.zeros((self.nb_actions, self.nb_actions)), ] results, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info) L, LT = results elif K.backend() == 'tensorflow': import tensorflow as tf # Number of elements in a triangular matrix. nb_elems = (self.nb_actions * self.nb_actions + self.nb_actions) // 2 # Create mask for the diagonal elements in L_flat. This is used to exponentiate # only the diagonal elements, which is done before gathering. diag_indeces = [0] for row in range(1, self.nb_actions): diag_indeces.append(diag_indeces[-1] + (row + 1)) diag_mask = np.zeros(1 + nb_elems) # +1 for the leading zero diag_mask[np.array(diag_indeces) + 1] = 1 diag_mask = K.variable(diag_mask) # Add leading zero element to each element in the L_flat. We use this zero # element when gathering L_flat into a lower triangular matrix L. nb_rows = tf.shape(L_flat)[0] zeros = tf.expand_dims(tf.tile(K.zeros((1, )), [nb_rows]), 1) try: # Old TF behavior. L_flat = tf.concat(1, [zeros, L_flat]) except TypeError: # New TF behavior L_flat = tf.concat([zeros, L_flat], 1) # Create mask that can be used to gather elements from L_flat and put them # into a lower triangular matrix. tril_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32') tril_mask[np.tril_indices(self.nb_actions)] = range( 1, nb_elems + 1) # Finally, process each element of the batch. init = [ K.zeros((self.nb_actions, self.nb_actions)), K.zeros((self.nb_actions, self.nb_actions)), ] def fn(a, x): # Exponentiate everything. This is much easier than only exponentiating # the diagonal elements, and, usually, the action space is relatively low. x_ = K.exp(x) + K.epsilon() # Only keep the diagonal elements. x_ *= diag_mask # Add the original, non-diagonal elements. x_ += x * (1. - diag_mask) # Finally, gather everything into a lower triangular matrix. L_ = tf.gather(x_, tril_mask) return [L_, tf.transpose(L_)] tmp = tf.scan(fn, L_flat, initializer=init) if isinstance(tmp, (list, tuple)): # TensorFlow 0.10 now returns a tuple of tensors. L, LT = tmp else: # Old TensorFlow < 0.10 returns a shared tensor. L = tmp[:, 0, :, :] LT = tmp[:, 1, :, :] else: raise RuntimeError('Unknown Keras backend "{}".'.format( K.backend())) assert L is not None assert LT is not None P = K.batch_dot(L, LT) elif self.mode == 'diag': if K.backend() == 'theano': import theano.tensor as T import theano def fn(x, P_acc): x_ = K.zeros((self.nb_actions, self.nb_actions)) x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)], x) return x_ outputs_info = [ K.zeros((self.nb_actions, self.nb_actions)), ] P, _ = theano.scan(fn=fn, sequences=L_flat, outputs_info=outputs_info) elif K.backend() == 'tensorflow': import tensorflow as tf # Create mask that can be used to gather elements from L_flat and put them # into a diagonal matrix. 
diag_mask = np.zeros((self.nb_actions, self.nb_actions), dtype='int32') diag_mask[np.diag_indices(self.nb_actions)] = range( 1, self.nb_actions + 1) # Add leading zero element to each element in the L_flat. We use this zero # element when gathering L_flat into a lower triangular matrix L. nb_rows = tf.shape(L_flat)[0] zeros = tf.expand_dims(tf.tile(K.zeros((1, )), [nb_rows]), 1) try: # Old TF behavior. L_flat = tf.concat(1, [zeros, L_flat]) except TypeError: # New TF behavior L_flat = tf.concat([zeros, L_flat], 1) # Finally, process each element of the batch. def fn(a, x): x_ = tf.gather(x, diag_mask) return x_ P = tf.scan(fn, L_flat, initializer=K.zeros( (self.nb_actions, self.nb_actions))) else: raise RuntimeError('Unknown Keras backend "{}".'.format( K.backend())) assert P is not None assert K.ndim(P) == 3 # Combine a, mu and P into a scalar (over the batches). What we compute here is # -.5 * (a - mu)^T * P * (a - mu), where * denotes the dot-product. Unfortunately # TensorFlow handles vector * P slightly suboptimal, hence we convert the vectors to # 1xd/dx1 matrices and finally flatten the resulting 1x1 matrix into a scalar. All # operations happen over the batch size, which is dimension 0. prod = K.batch_dot(K.expand_dims(a - mu, 1), P) prod = K.batch_dot(prod, K.expand_dims(a - mu, -1)) A = -.5 * K.batch_flatten(prod) assert K.ndim(A) == 2 return A
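# --- Toy numpy sketch (illustrative only, not part of the layer above) -------------------------
# The final block computes A = -0.5 * (a - mu)^T P (a - mu) per batch element, with P = L L^T.
# The sizes below are assumptions made for this sketch.
import numpy as np

batch_size, nb_actions = 2, 3
rng = np.random.default_rng(0)

L = np.tril(rng.normal(size=(batch_size, nb_actions, nb_actions)))  # lower-triangular L per sample
P = L @ np.transpose(L, (0, 2, 1))                                   # P = L L^T (positive semi-definite)
a = rng.normal(size=(batch_size, nb_actions))
mu = rng.normal(size=(batch_size, nb_actions))

diff = a - mu
A = -0.5 * np.einsum('bi,bij,bj->b', diff, P, diff)  # batched quadratic form
print(A.shape)  # (2,)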
def backend_dot(x):
    return K.batch_dot(x[0], x[1])
def test(): ## argument train_path = sys.argv[1] test_path = sys.argv[2] predict_path = sys.argv[3] model_name = sys.argv[4] char_embed_path = sys.argv[5] word_embed_path = sys.argv[6] pos_embed_path = sys.argv[7] dict_path = sys.argv[8] train_rate = 0.9 max_char_ctx_len = 1160 max_word_ctx_len = 680 char_ctx_len = 1160 char_qus_len = 240 word_ctx_len = 400 word_qus_len = 40 word_char_len = 5 char_embed_size = 128 word_embed_size = 128 pos_embed_size = 32 hidden_size = 64 model_size = 64 max_epochs = 50 batch_size = 8 lr = 0.001 drop_rate = 0.5 recur_drop_rate = 0.0 patience = 20 ## load data print("load data") st = time.time() train_raw_data = data_utils.load_json_data(train_path) test_raw_data = data_utils.load_json_data(test_path) # # load pos data # train_gen_pos_data = data_utils.load_json_data(train_pos_path) # test_gen_pos_data = data_utils.load_json_data(test_pos_path) # load embedding char_embedding = word2vec.Word2Vec.load(char_embed_path) word_embedding = word2vec.Word2Vec.load(word_embed_path) pos_embedding = word2vec.Word2Vec.load(pos_embed_path) et = time.time() print("cost time:", et - st) ## process data print("process data") st = time.time() train_data = data_utils.make_train_data( train_raw_data ) # data format: (id, context, question, answer_start, answer_end) test_data = data_utils.make_test_data( test_raw_data) # data format: (id, context, question) train_context = [data[1] for data in train_data] train_question = [data[2] for data in train_data] train_char_answer_start = [data[3] for data in train_data] train_char_answer_end = [data[4] for data in train_data] # train_context_poss = [data['context'] for data in train_gen_pos_data['data']] # train_question_poss = [data['question'] for data in train_gen_pos_data['data']] test_id = [data[0] for data in test_data] test_context = [data[1] for data in test_data] test_question = [data[2] for data in test_data] # test_context_poss = [data['context'] for data in test_gen_pos_data['data']] # test_question_poss = [data['question'] for data in test_gen_pos_data['data']] del train_data del test_data et = time.time() print("cost time:", et - st) ## load vocabulary print("load vocabulary") st = time.time() char_vocab = data_utils.load_json_data('model_%s_char_vocab.json' % model_name) word_vocab = data_utils.load_json_data('model_%s_word_vocab.json' % model_name) pos_vocab = data_utils.load_json_data('model_%s_pos_vocab.json' % model_name) # poss = train_context_poss + train_question_poss + test_context_poss + test_question_poss # pos_vocab, rev_pos_vocab = data_utils.build_vocabulary_with_embedding(poss, pos_embedding) char_vocab_size = len(char_vocab) word_vocab_size = len(word_vocab) pos_vocab_size = len(pos_vocab) et = time.time() print("char vocab size:", char_vocab_size) print("word vocab size:", word_vocab_size) print("pos vocab size:", pos_vocab_size) print("cost time:", et - st) ## tokenize data print("tokenize data") st = time.time() train_context_chars = data_utils.tokenize_to_chars(train_context) train_question_chars = data_utils.tokenize_to_chars(train_question) test_context_chars = data_utils.tokenize_to_chars(test_context) test_question_chars = data_utils.tokenize_to_chars(test_question) train_context_words = data_utils.tokenize_to_words(train_context, init_dict=True, dict_path=dict_path) train_question_words = data_utils.tokenize_to_words(train_question, init_dict=True, dict_path=dict_path) test_context_words = data_utils.tokenize_to_words(test_context, init_dict=True, dict_path=dict_path) test_question_words = 
data_utils.tokenize_to_words(test_question, init_dict=True, dict_path=dict_path) train_context_poss = data_utils.tokenize_to_poss(train_context, init_dict=True, dict_path=dict_path) train_question_poss = data_utils.tokenize_to_poss(train_question, init_dict=True, dict_path=dict_path) test_context_poss = data_utils.tokenize_to_poss(test_context, init_dict=True, dict_path=dict_path) test_question_poss = data_utils.tokenize_to_poss(test_question, init_dict=True, dict_path=dict_path) et = time.time() print("cost time:", et - st) ## select data # select the data which sequence lengths satisfy length constraints print("select data") st = time.time() select_indices = data_utils.select_data_by_lengths(train_context_words, train_question_words, word_ctx_len, word_qus_len) train_context_chars = [train_context_chars[i] for i in select_indices] train_context_words = [train_context_words[i] for i in select_indices] train_context_poss = [train_context_poss[i] for i in select_indices] train_question_chars = [train_question_chars[i] for i in select_indices] train_question_words = [train_question_words[i] for i in select_indices] train_question_poss = [train_question_poss[i] for i in select_indices] train_char_answer_start = [ train_char_answer_start[i] for i in select_indices ] train_char_answer_end = [train_char_answer_end[i] for i in select_indices] et = time.time() print("cost time:", et - st) ## set answer # it should be done after tokenize sentences to words print("set answer") st = time.time() train_word_answer_start, train_word_answer_end = data_utils.set_word_answer( train_context_words, train_char_answer_start, train_char_answer_end, word_ctx_len) train_answer_start, train_answer_end = train_word_answer_start, train_word_answer_end et = time.time() print("cost time:", et - st) ## pad data print("pad data") st = time.time() # clip words to chars # it should be done after build vocab (add PAD) train_context_clip_chars = data_utils.clip_words_to_chars( train_context_words, word_char_len) train_question_clip_chars = data_utils.clip_words_to_chars( train_question_words, word_char_len) test_context_clip_chars = data_utils.clip_words_to_chars( test_context_words, word_char_len) test_question_clip_chars = data_utils.clip_words_to_chars( test_question_words, word_char_len) # print("Debug: tarin_context_clip_chars[0]:") # print(train_context_clip_chars[0]) # print("Debug: train_question_clip_chars[0]:") # print(train_question_clip_chars[0]) # padding train_context_pad_chars = data_utils.pad_sequences( train_context_clip_chars, word_ctx_len * word_char_len) train_question_pad_chars = data_utils.pad_sequences( train_question_clip_chars, word_qus_len * word_char_len) train_context_pad_words = data_utils.pad_sequences(train_context_words, word_ctx_len) train_question_pad_words = data_utils.pad_sequences( train_question_words, word_qus_len) train_context_pad_poss = data_utils.pad_sequences(train_context_poss, word_ctx_len) train_question_pad_poss = data_utils.pad_sequences(train_question_poss, word_qus_len) test_context_pad_chars = data_utils.pad_sequences( test_context_clip_chars, word_ctx_len * word_char_len) test_question_pad_chars = data_utils.pad_sequences( test_question_clip_chars, word_qus_len * word_char_len) test_context_pad_words = data_utils.pad_sequences(test_context_words, word_ctx_len) test_question_pad_words = data_utils.pad_sequences(test_question_words, word_qus_len) test_context_pad_poss = data_utils.pad_sequences(test_context_poss, word_ctx_len) test_question_pad_poss = 
data_utils.pad_sequences(test_question_poss, word_qus_len) et = time.time() print("cost time:", et - st) ## make arrays print("make arrays") st = time.time() # map vocab to index # print("Debug: train_context_pad_words[0]:") # print(train_context_pad_words[0]) # print("Debug: train_question_pad_words[0]:") # print(train_question_pad_words[0]) train_context_char_indices = data_utils.map_vocabulary_index( train_context_pad_chars, char_vocab) train_question_char_indices = data_utils.map_vocabulary_index( train_question_pad_chars, char_vocab) train_context_word_indices = data_utils.map_vocabulary_index( train_context_pad_words, word_vocab) train_question_word_indices = data_utils.map_vocabulary_index( train_question_pad_words, word_vocab) train_context_pos_indices = data_utils.map_vocabulary_index( train_context_pad_poss, pos_vocab) train_question_pos_indices = data_utils.map_vocabulary_index( train_question_pad_poss, pos_vocab) test_context_char_indices = data_utils.map_vocabulary_index( test_context_pad_chars, char_vocab) test_question_char_indices = data_utils.map_vocabulary_index( test_question_pad_chars, char_vocab) test_context_word_indices = data_utils.map_vocabulary_index( test_context_pad_words, word_vocab) test_question_word_indices = data_utils.map_vocabulary_index( test_question_pad_words, word_vocab) test_context_pos_indices = data_utils.map_vocabulary_index( test_context_pad_poss, pos_vocab) test_question_pos_indices = data_utils.map_vocabulary_index( test_question_pad_poss, pos_vocab) # make one-hot label train_answer_start_onehot = data_utils.one_hot_encoding( train_answer_start, word_ctx_len) train_answer_end_onehot = data_utils.one_hot_encoding( train_answer_end, word_ctx_len) # to array # X1: context chars; X2: context words; X3: context poss; # X4: question chars; X5: question words; X6: question poss; # Y1: answer_start, Y2: answer_end train_X1 = np.array(train_context_char_indices, dtype=np.int32) train_X2 = np.array(train_context_word_indices, dtype=np.int32) train_X3 = np.array(train_context_pos_indices, dtype=np.int32) train_X4 = np.array(train_question_char_indices, dtype=np.int32) train_X5 = np.array(train_question_word_indices, dtype=np.int32) train_X6 = np.array(train_question_pos_indices, dtype=np.int32) train_Y1 = np.array(train_answer_start_onehot, dtype=np.int32) train_Y2 = np.array(train_answer_end_onehot, dtype=np.int32) train_word_ans1 = np.array(train_answer_start, dtype=np.int32) train_word_ans2 = np.array(train_answer_end, dtype=np.int32) train_ans1 = np.array(train_char_answer_start, dtype=np.int32) train_ans2 = np.array(train_char_answer_end, dtype=np.int32) test_X1 = np.array(test_context_char_indices, dtype=np.int32) test_X2 = np.array(test_context_word_indices, dtype=np.int32) test_X3 = np.array(test_context_pos_indices, dtype=np.int32) test_X4 = np.array(test_question_char_indices, dtype=np.int32) test_X5 = np.array(test_question_word_indices, dtype=np.int32) test_X6 = np.array(test_question_pos_indices, dtype=np.int32) # make embedding weight matrix word_embed_matrix = data_utils.make_embedding_matrix( word_embedding, word_vocab, word_embed_size) char_embed_matrix = data_utils.make_embedding_matrix( char_embedding, char_vocab, char_embed_size) pos_embed_matrix = data_utils.make_embedding_matrix( pos_embedding, pos_vocab, pos_embed_size) # delete data for releasing memory del train_context, train_question, test_context, test_question del train_context_chars, train_question_chars, test_context_chars, test_question_chars # del train_context_words, 
train_question_words, test_context_words, test_question_words del train_context_clip_chars, train_question_clip_chars, test_context_clip_chars, test_question_clip_chars del train_context_char_indices, train_question_char_indices, test_context_char_indices, test_question_char_indices del train_context_word_indices, train_question_word_indices, test_context_word_indices, test_question_word_indices del train_context_pos_indices, train_question_pos_indices, test_context_pos_indices, test_question_pos_indices del train_word_answer_start, train_word_answer_end, train_char_answer_start, train_char_answer_end del train_answer_start_onehot, train_answer_end_onehot et = time.time() print("train shape:", train_X1.shape, train_X2.shape, train_X3.shape, train_X4.shape, train_X5.shape, train_X6.shape, train_Y1.shape, train_Y2.shape) print("test shape:", test_X1.shape, test_X2.shape, test_X3.shape, test_X4.shape, test_X5.shape, test_X6.shape) print("cost time:", et - st) ## XXX build model print("build model") st = time.time() # input layers # X1: context chars; X2: context words; X3: context poss; # X4: question chars; X5: question words; X6: question poss; # Y1: answer_start; Y2: answer_end var_x1_input = Input(shape=(word_ctx_len * word_char_len, ), dtype=np.int32) var_x2_input = Input(shape=(word_ctx_len, ), dtype=np.int32) var_x3_input = Input(shape=(word_ctx_len, ), dtype=np.int32) var_x4_input = Input(shape=(word_qus_len * word_char_len, ), dtype=np.int32) var_x5_input = Input(shape=(word_qus_len, ), dtype=np.int32) var_x6_input = Input(shape=(word_qus_len, ), dtype=np.int32) # embedding layers var_x1_embed = Embedding( input_dim=char_vocab_size, output_dim=char_embed_size, weights=[char_embed_matrix], input_length=word_ctx_len * word_char_len, trainable=False )(var_x1_input) # shape: (None, ctx_length * word_length, char_embed_size) var_x2_embed = Embedding( input_dim=word_vocab_size, output_dim=word_embed_size, weights=[word_embed_matrix], input_length=word_ctx_len, trainable=False)( var_x2_input) # shape: (None, ctx_length, word_embed_size) var_x3_embed = Embedding( input_dim=pos_vocab_size, output_dim=pos_embed_size, weights=[pos_embed_matrix], input_length=word_ctx_len, trainable=False)( var_x3_input) # shape: (None, ctx_length, pos_embed_size) var_x4_embed = Embedding( input_dim=char_vocab_size, output_dim=char_embed_size, weights=[char_embed_matrix], input_length=word_qus_len * word_char_len, trainable=False )(var_x4_input) # shape: (None, qus_length * word_length, char_embed_size) var_x5_embed = Embedding( input_dim=word_vocab_size, output_dim=word_embed_size, weights=[word_embed_matrix], input_length=word_qus_len, trainable=False)( var_x5_input) # shape: (None, qus_length, word_embed_size) var_x6_embed = Embedding( input_dim=pos_vocab_size, output_dim=pos_embed_size, weights=[pos_embed_matrix], input_length=word_qus_len, trainable=False)( var_x6_input) # shape: (None, qus_length, pos_embed_size) var_x1_embed = Reshape([word_ctx_len, word_char_len * char_embed_size])( var_x1_embed ) # shape: (None, ctx_length, word_length * char_embed_size) var_x4_embed = Reshape([word_qus_len, word_char_len * char_embed_size])( var_x4_embed ) # shape: (None, qus_length, word_length * char_embed_size) var_char_embed_layer = Dense(units=word_embed_size) var_x1_embed = TimeDistributed( var_char_embed_layer, input_shape=(word_ctx_len, word_char_len * char_embed_size))( var_x1_embed) # shape: (None, ctx_length, word_embed_size) var_x1_embed = Activation('relu')(var_x1_embed) # var_x1_embed = 
Dropout(rate=drop_rate)(var_x1_embed) var_x4_embed = TimeDistributed( var_char_embed_layer, input_shape=(word_qus_len, word_char_len * char_embed_size))( var_x4_embed) # shape: (None, qus_length, word_embed_size) var_x4_embed = Activation('relu')(var_x4_embed) # var_x4_embed = Dropout(rate=drop_rate)(var_x4_embed) #XXX concatenate word embedding and pos embedding directly var_ctx_embed = concatenate( [var_x1_embed, var_x2_embed, var_x3_embed], axis=2 ) # shape: (None, ctx_length, word_embed_size * 2 + pos_embed_size) var_qus_embed = concatenate( [var_x4_embed, var_x5_embed, var_x6_embed], axis=2 ) # shape: (None, qus_length, word_embed_size * 2 + pos_embed_size) var_ctx_embed = Dropout(rate=drop_rate)(var_ctx_embed) var_qus_embed = Dropout(rate=drop_rate)(var_qus_embed) var_ctx_lstm = Bidirectional( LSTM(units=hidden_size, recurrent_dropout=recur_drop_rate, return_sequences=True))( var_ctx_embed) # shape: (None, ctx_length, hidden_size * 2) var_qus_lstm = Bidirectional( LSTM(units=hidden_size, recurrent_dropout=recur_drop_rate, return_sequences=True))( var_qus_embed) # shape: (None, qus_length, hidden_size * 2) # dropout ? # var_ctx_lstm = Dropout(rate=drop_rate)(var_ctx_lstm) # var_qus_lstm = Dropout(rate=drop_rate)(var_qus_lstm) # attention layers var_ctx_flatten = Flatten()( var_ctx_lstm) # shape: (None, ctx_length * hidden_size * 2) var_qus_flatten = Flatten()( var_qus_lstm) # shape: (None, qus_length * hidden_size * 2) var_ctx_repeat = RepeatVector(word_qus_len)( var_ctx_flatten ) # shape: (None, qus_length, ctx_length * hidden_size * 2) var_qus_repeat = RepeatVector(word_ctx_len)( var_qus_flatten ) # shape: (None, ctx_length, qus_length * hidden_size * 2) var_ctx_repeat = Reshape([word_qus_len, word_ctx_len, hidden_size * 2])( var_ctx_repeat ) # shape: (None, qus_length, ctx_length, hidden_size * 2) var_qus_repeat = Reshape([word_ctx_len, word_qus_len, hidden_size * 2])( var_qus_repeat ) # shape: (None, ctx_length, qus_length, hidden_size * 2) var_ctx_repeat = Permute( [2, 1, 3])(var_ctx_repeat ) # shape: (None, ctx_length, qus_length, hidden_size * 2) var_mul_repeat = multiply([ var_ctx_repeat, var_qus_repeat ]) # shape: (None, ctx_length, qus_length, hidden_size * 2) var_sim_repeat = concatenate( [var_ctx_repeat, var_qus_repeat, var_mul_repeat], axis=3) # shape: (None, ctx_length, qus_length, hidden_size * 6) var_sim_sequence = Reshape([word_ctx_len * word_qus_len, hidden_size * 6])( var_sim_repeat ) # shape: (None, ctx_length * qus_length, hidden_size * 6) # dropout ? # var_sim_sequence = Dropout(rate=drop_rate)(var_sim_sequence) var_similarity = TimeDistributed( Dense(units=1), input_shape=(word_ctx_len * word_qus_len, hidden_size * 6))( var_sim_sequence) # shape: (None, ctx_length * qus_length, 1) var_similarity = Reshape([word_ctx_len, word_qus_len])( var_similarity) # shape: (None, ctx_length, qus_length) var_similarity = Activation('relu')(var_similarity) # dropout ? 
# var_similarity = Dropout(rate=drop_rate)(var_similarity) var_c2qatt_weight = TimeDistributed( Activation('softmax'), input_shape=(word_ctx_len, word_qus_len))( var_similarity) # shape: (None, ctx_length, qus_length) var_c2qatt_ctx = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 1]))( [var_c2qatt_weight, var_qus_lstm]) # shape: (None, ctx_length, hidden_size * 2) var_q2catt_weight = Lambda(lambda x: K.max(x, axis=2))( var_similarity) # shape: (None, ctx_length) var_q2catt_weight = RepeatVector(hidden_size * 2)( var_q2catt_weight) # shape: (None, hidden_size * 2, ctx_length) var_q2catt_weight = Permute([2, 1])( var_q2catt_weight) # shape: (None, ctx_length, hidden_size * 2) var_q2catt_ctx = multiply([var_q2catt_weight, var_ctx_lstm ]) # shape: (None, ctx_length, hidden_size * 2) var_c2qctx_attmul = multiply( [var_ctx_lstm, var_c2qatt_ctx]) # shape: (None, ctx_length, hidden_size * 2) var_q2cctx_attmul = multiply( [var_ctx_lstm, var_q2catt_ctx]) # shape: (None, ctx_length, hidden_size * 2) var_attention = concatenate( [var_ctx_lstm, var_c2qatt_ctx, var_c2qctx_attmul, var_q2cctx_attmul], axis=2) # shape: (None, ctx_length, hidden_size * 8) var_attention = Activation('relu')(var_attention) # # dropout ? # var_attention = Dropout(rate=drop_rate)(var_attention) # model layers var_model1_lstm = Bidirectional( LSTM(units=model_size, recurrent_dropout=recur_drop_rate, return_sequences=True))( var_attention) # shape: (None, ctx_length, model_size * 2) var_model1_att = concatenate( [var_attention, var_model1_lstm], axis=2) # shape: (None, ctx_length, hidden_size * 8 + model_size * 2) # dropout ? # var_model1_att = Dropout(rate=drop_rate)(var_model1_att) var_model2_lstm = Bidirectional( LSTM(units=model_size, recurrent_dropout=recur_drop_rate, return_sequences=True))( var_model1_lstm) # shape: (None, ctx_length, model_size * 2) var_model2_att = concatenate( [var_attention, var_model2_lstm], axis=2) # shape: (None, ctx_length, hidden_size * 8 + model_size * 2) # dropout ? # var_model2_att = Dropout(rate=drop_rate)(var_model2_att) # output layers var_pointer1_weight = TimeDistributed( Dense(units=1), input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))( var_model1_att) # shape: (None, ctx_length, 1) var_pointer1_weight = Flatten()( var_pointer1_weight) # shape: (None, ctx_length) var_pointer1 = Activation('softmax')( var_pointer1_weight) # shape: (None, ctx_length) var_pointer2_weight = TimeDistributed( Dense(units=1), input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))( var_model2_att) # shape: (None, ctx_length, 1) var_pointer2_weight = Flatten()( var_pointer2_weight) # shape: (None, ctx_length) var_pointer2 = Activation('softmax')( var_pointer2_weight) # shape: (None, ctx_length) model = Model(inputs=[ var_x1_input, var_x2_input, var_x3_input, var_x4_input, var_x5_input, var_x6_input ], outputs=[var_pointer1, var_pointer2]) adam = Adam(lr=lr) # # Set loss functions ? 
# def two_pointers_crossentropy(y_true, y_pred): # p1_true, p1_pred = y_true[0], y_pred[0] # p2_true, p2_pred = y_true[:,1], y_pred[1] # p1_loss = categorical_crops # XXX use multiple loss model.compile( optimizer=adam, loss=['categorical_crossentropy', 'categorical_crossentropy'], loss_weights=[0.5, 0.5], metrics=['accuracy']) et = time.time() print("cost time:", et - st) ## evaluate print("evaluate") st = time.time() model = load_model('model_%s.h5' % model_name, custom_objects={'tf': tf}) # compute predict print("predict") st = time.time() train_Y1_hat, train_Y2_hat = model.predict( [train_X1, train_X2, train_X3, train_X4, train_X5, train_X6], batch_size=batch_size) et = time.time() print("cost time:", et - st) train_Y1_word_pred, train_Y2_word_pred = model_utils.constraint_predict( train_Y1_hat, train_Y2_hat) train_Y1_pred, train_Y2_pred = data_utils.set_char_answer( train_context_words, train_Y1_word_pred, train_Y2_word_pred) train_Y1_pred = np.array(train_Y1_pred, dtype=np.int32) train_Y2_pred = np.array(train_Y2_pred, dtype=np.int32) # evaluate predict with setting answer (word answer) train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy( train_word_ans1, train_Y1_word_pred, train_word_ans2, train_Y2_word_pred) train_prec, train_rec, train_f1 = evaluation.compute_scores( train_word_ans1, train_Y1_word_pred, train_word_ans2, train_Y2_word_pred, word_ctx_len) print("word-level train accuracy:", train_acc1, train_acc2, train_accuracy) print("word-level train prec rec:", train_prec, train_rec) print("word-level train f1:", train_f1) # evaluate predict with real answer (char answer) train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy( train_ans1, train_Y1_pred, train_ans2, train_Y2_pred) train_prec, train_rec, train_f1 = evaluation.compute_scores( train_ans1, train_Y1_pred, train_ans2, train_Y2_pred, max_char_ctx_len) print("char-level train accuracy:", train_acc1, train_acc2, train_accuracy) print("char-level train prec rec:", train_prec, train_rec) print("char-level train f1:", train_f1) et = time.time() print("cost time:", et - st) ## test print("test") st = time.time() test_Y1_hat, test_Y2_hat = model.predict( [test_X1, test_X2, test_X3, test_X4, test_X5, test_X6], batch_size=batch_size) # compute predict test_Y1_word_pred, test_Y2_word_pred = model_utils.constraint_predict( test_Y1_hat, test_Y2_hat) test_Y1_pred, test_Y2_pred = data_utils.set_char_answer( test_context_words, test_Y1_word_pred, test_Y2_word_pred) test_Y1_pred = np.array(test_Y1_pred, dtype=np.int32) test_Y2_pred = np.array(test_Y2_pred, dtype=np.int32) data_utils.write_predict(predict_path, test_id, test_Y1_pred, test_Y2_pred) et = time.time() print("cost time:", et - st)
vec[0][0] = 1 In = [] for j in range(n_data): In.append(Input(shape=[len_feature])) In.append(Input(shape=(neighbors, n_data))) In.append(Input(shape=(1, neighbors))) feature = [] for j in range(n_data): feature.append(encoder(In[j * 2])) feature_ = Concatenate(axis=1)(feature) relation1 = [] for j in range(n_data): T = Lambda(lambda x: K.batch_dot(x[0], x[1]))([In[j * 2 + 1], feature_]) relation1.append(m1([T, T, T, In[n_data * 2]])) relation1_ = Concatenate(axis=1)(relation1) relation2 = [] for j in range(n_data): T = Lambda(lambda x: K.batch_dot(x[0], x[1]))([In[j * 2 + 1], relation1_]) relation2.append(m2([T, T, T, In[n_data * 2]])) V = [] for j in range(n_data): V.append(q_net([feature[j], relation1[j], relation2[j]])) model = Model(input=In, output=V) model.compile(optimizer=Adam(lr=0.0001), loss='mse')
def gram(cnn):
    # Pairwise correlation of the feature map with itself across the channel
    # axis, flattened to a fixed-size vector (the hard-coded 2500 assumes the
    # correlation map has 2500 entries per sample).
    gram = K.batch_dot(cnn, cnn, axes=[3, 3])
    gram = K.reshape(gram, (-1, 2500))
    return gram
def call(self, inputs):
    q, a = inputs
    # Bilinear similarity: per sample this evaluates q^T M a.
    # Reference: https://github.com/wglassly/cnnormaliztion/blob/master/src/nn_layers.py#L822
    return K.batch_dot(q, K.dot(a, K.transpose(self.M)), axes=1)
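# Quick numerical check (sketch, hypothetical shapes) that the expression above
# is the per-sample bilinear form q^T M a, up to a trailing singleton axis:
import numpy as np
q = np.random.randn(4, 6).astype('float32')   # (batch, dim_q)
a = np.random.randn(4, 8).astype('float32')   # (batch, dim_a)
M = np.random.randn(6, 8).astype('float32')   # (dim_q, dim_a)
keras_style = np.sum(q * (a @ M.T), axis=1)   # mirrors K.batch_dot(..., axes=1)
explicit = np.array([q[i] @ M @ a[i] for i in range(4)])
assert np.allclose(keras_style, explicit, atol=1e-5)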
def gram_matrix_b(x):
    # Gram matrix of a (batch, h, w, channels) feature map: move channels first,
    # flatten the spatial axes, take the batched feat @ feat^T and normalize by
    # the number of entries in each feature map.
    x = K.permute_dimensions(x, (0, 3, 1, 2))
    s = K.shape(x)
    feat = K.reshape(x, (s[0], s[1], s[2] * s[3]))
    return K.batch_dot(feat, K.permute_dimensions(
        feat, (0, 2, 1))) / K.prod(K.cast(s[1:], K.floatx()))
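# A minimal sketch of how gram_matrix_b could plug into a Gatys-style style
# loss (illustrative names; both inputs are (batch, h, w, channels) feature
# maps taken from the same layer of a CNN):
def style_loss(style_feat, combo_feat):
    s = gram_matrix_b(style_feat)   # (batch, channels, channels)
    c = gram_matrix_b(combo_feat)
    return K.mean(K.square(s - c), axis=[1, 2])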
def dot_funxtion(x): return K.batch_dot(x[0], x[1])
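# dot_funxtion is typically wrapped as a merge layer; a usage sketch with
# hypothetical tensors `queries` (batch, n, d) and `keys_t` (batch, d, m):
# scores = Lambda(dot_funxtion)([queries, keys_t])   # (batch, n, m)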
def _g_logvar_chol_3D(p, epsilon):
    # Reparameterization with a per-sample Cholesky factor: scale the noise by
    # exp(logvar / 2), correlate it through `chol`, then shift by the mean.
    mu, logvar, chol = p
    epsilon = K.batch_dot(epsilon * K.exp(logvar / 2), chol, axes=(2, 1))
    return mu + epsilon
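# A sketch of wiring the sampler above into a model (assumed shapes: mu and
# logvar are (batch, steps, dim); chol is a per-sample (batch, dim, dim)
# Cholesky factor):
def sample_3d(args):
    mu, logvar, chol = args
    eps = K.random_normal(K.shape(mu))
    return _g_logvar_chol_3D([mu, logvar, chol], eps)

# z = Lambda(sample_3d)([mu, logvar, chol])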
def attention(self, pre_q, pre_v, pre_k, out_seq_len, d_model, attn_mask=None, training=None): """ Calculates the output of the attention once the affine transformations of the inputs are done. Here's the shapes of the arguments: :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads) :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads) :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads) :param out_seq_len: the length of the output sequence :param d_model: dimensionality of the model (by the paper) :param training: Passed by Keras. Should not be defined manually. Optional scalar tensor indicating if we're in training or inference phase. """ # shaping Q and V into (batch_size, num_heads, seq_len, d_model//heads) q = K.permute_dimensions(pre_q, [0, 2, 1, 3]) v = K.permute_dimensions(pre_v, [0, 2, 1, 3]) if self.compression_window_size is None: k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1]) else: # Memory-compressed attention described in paper # "Generating Wikipedia by Summarizing Long Sequences" # (https://arxiv.org/pdf/1801.10198.pdf) # It compresses keys and values using 1D-convolution which reduces # the size of Q * K_transposed from roughly seq_len^2 # to convoluted_seq_len^2. If we use strided convolution with # window size = 3 and stride = 3, memory requirements of such # memory-compressed attention will be 9 times smaller than # that of the original version. if self.use_masking: raise NotImplementedError( "Masked memory-compressed attention has not " "been implemented yet") k = K.permute_dimensions(pre_k, [0, 2, 1, 3]) k, v = [ K.reshape( # Step 3: Return the result to its original dimensions # (batch_size, num_heads, seq_len, d_model//heads) K.bias_add( # Step 3: ... and add bias K.conv1d( # Step 2: we "compress" K and V using strided conv K.reshape( # Step 1: we reshape K and V to # (batch * num_heads, seq_len, d_model//heads) item, (-1, K.int_shape(item)[-2], d_model // self.num_heads)), kernel, strides=self.compression_window_size, padding='valid', data_format='channels_last'), bias, data_format='channels_last'), # new shape K.concatenate([ K.shape(item)[0], K.shape(item)[1], # shape: (batch_size, num_heads) [-1, d_model // self.num_heads] ])) # shape: (seq_len, n_model//num_heads) for item, kernel, bias in ((k, self.k_conv_kernel, self.k_conv_bias), (v, self.v_conv_kernel, self.v_conv_bias)) ] k_transposed = K.permute_dimensions(k, [0, 1, 3, 2]) # shaping K into (batch_size, num_heads, d_model//heads, seq_len) # for further matrix multiplication sqrt_d = K.sqrt(K.cast(d_model, dtype=K.floatx()) // self.num_heads) q_shape = K.shape(q) k_t_shape = K.shape(k_transposed) v_shape = K.shape(v) #q_shape = K.int_shape(q) #k_t_shape = K.int_shape(k_transposed) #v_shape = K.int_shape(v) # before performing batch_dot all tensors are being converted to 3D # shape (batch_size * num_heads, tar_seq_len, d_model//num_heads) to make sure batch_dot # performs identically on all backends attention_heads = K.reshape( K.batch_dot( self.apply_dropout_if_needed( K.softmax( # mask the attention for the prediction process #self.mask_attention_if_needed( self.mask_attention( # core scaled dot product K. 
batch_dot( # (batch_size * num_heads, tar_seq_len, src_seq_len) K.reshape( q, (-1, q_shape[-2], q_shape[-1]) ), # q_shape: (batch_size*num_heads, q_seq_len, d_model//heads) K.reshape( k_transposed, # k_transposed: (batch_size*num_heads, d_model//heads, k_seq_len) (-1, k_t_shape[-2], k_t_shape[-1]))) / sqrt_d, attn_mask)), training=training), K.reshape(v, (-1, v_shape[-2], v_shape[-1])) ), # shape: (batch_size * num_heads, v_seq_len, d_model//heads) (-1, self.num_heads, q_shape[-2], q_shape[-1])) # shape: (batch_size * seq_length, d_model) attention_heads_merged = K.reshape( # shape (batch_size, q_seq_length, num_heads, d_model // num_heads) to make sure batch_dot K.permute_dimensions(attention_heads, [0, 2, 1, 3]), (-1, d_model)) # shape: (batch_size, out_seq_len, d_model). Generally, out_seq_len should be q_seq_len attention_out = K.reshape( K.dot(attention_heads_merged, self.output_weights), (-1, out_seq_len, d_model)) return attention_out
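# The core reshape-then-batch_dot pattern above, shown in isolation with
# hypothetical sizes: collapsing (batch, heads, len, depth) to 3D keeps
# K.batch_dot behaviour identical across backends.
import numpy as np
b, h, n, d = 2, 4, 5, 8
q = K.constant(np.random.randn(b, h, n, d))
k = K.constant(np.random.randn(b, h, n, d))
q3 = K.reshape(q, (-1, n, d))                                  # (b*h, n, d)
k3t = K.reshape(K.permute_dimensions(k, (0, 1, 3, 2)), (-1, d, n))
weights = K.softmax(K.batch_dot(q3, k3t) / np.sqrt(d))         # (b*h, n, n)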
def _g_std_chol(p, epsilon):
    # Same reparameterization idea with an explicit standard deviation `s`:
    # correlate the noise through `chol`, then scale by |s| and shift by the mean.
    mu, s, chol = p
    epsilon = K.batch_dot(epsilon, chol, axes=(1, 2))
    return mu + K.abs(s) * epsilon
def build(self, embedding_matrix): if self.config['rnn'] == 'gru' and self.config['gpu']: RNN = CuDNNGRU(self.config['rnn_output_size'], return_sequences=True) elif self.config['rnn'] == 'lstm' and self.config['gpu']: RNN = CuDNNLSTM(self.config['rnn_output_size'], return_sequences=True) elif self.config['rnn'] == 'gru' and not self.config['gpu']: RNN = GRU(self.config['rnn_output_size'], return_sequences=True, dropout=self.config['dropout_rate'], recurrent_dropout=self.config['dropout_rate']) else: RNN = LSTM(self.config['rnn_output_size'], return_sequences=True, dropout=self.config['dropout_rate'], recurrent_dropout=self.config['dropout_rate']) self.sentence_input = Input(shape=(self.config['max_length'], ), dtype='int32', name='sentence_input') embed = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], trainable=self.config['embed_trainable'], weights=[embedding_matrix])(self.sentence_input) embed = SpatialDropout1D(self.config['spatial_dropout_rate'])(embed) convs = [] for ksz in self.config['kernel_sizes']: conv = Conv1D(self.config['filters'], ksz, activation='relu', padding='same')(embed) convs.append(conv) cnn_out = concatenate(convs, axis=-1) if self.config['bidirectional']: rnn_out = Bidirectional(RNN)(embed) else: rnn_out = RNN(embed) capsule_cnn = Capsule(num_capsule=self.config['num_capsule'], dim_capsule=self.config['dim_capsule'], routings=self.config['routings'], share_weights=True, name='capsule_cnn')(cnn_out) capsule_cnn = Flatten()(capsule_cnn) capsule_rnn = Capsule(num_capsule=self.config['num_capsule'], dim_capsule=self.config['dim_capsule'], routings=self.config['routings'], share_weights=True, name='capsule_rnn')(rnn_out) capsule_rnn = Flatten()(capsule_rnn) cnn_u = TimeDistributed( Dense(self.config['hidden_dims'], activation='tanh', use_bias=True))(cnn_out) cnn_alpha = Dense(1)(cnn_u) cnn_alpha = Flatten()(cnn_alpha) cnn_alpha = Activation(activation='softmax')(cnn_alpha) cnn_att_rep = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[1, 1]))( [cnn_out, cnn_alpha]) rnn_u = TimeDistributed( Dense(self.config['hidden_dims'], activation='tanh', use_bias=True))(rnn_out) rnn_alpha = Dense(1)(rnn_u) rnn_alpha = Flatten()(rnn_alpha) rnn_alpha = Activation(activation='softmax')(rnn_alpha) rnn_att_rep = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[1, 1]))( [rnn_out, rnn_alpha]) cnn_concat = concatenate([capsule_cnn, cnn_att_rep], axis=-1) rnn_concat = concatenate([capsule_rnn, rnn_att_rep], axis=-1) rep = concatenate([cnn_concat, rnn_concat], axis=-1) return rep
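# A sketch of turning the returned representation into a classifier head
# (hypothetical config key 'num_classes'; `rep` is the tensor returned above):
# out = Dense(self.config['num_classes'], activation='softmax')(rep)
# model = Model(inputs=self.sentence_input, outputs=out)
# model.compile(optimizer='adam', loss='categorical_crossentropy',
#               metrics=['accuracy'])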
def MultiHeadsAttModel(self, In_agent, In_neighbor, l=5, d=128, dv=16, dout=128, nv=8, suffix=-1): """ input:[bacth,agent,128] output: -hidden state: [batch,agent,32] -attention: [batch,agent,neighbor] """ """ agent repr """ print("In_agent.shape,In_neighbor.shape,l, d, dv, dout, nv", In_agent.shape, In_neighbor.shape, l, d, dv, dout, nv) #[batch,agent,dim]->[batch,agent,1,dim] agent_repr = Reshape((self.num_agents, 1, d))(In_agent) """ neighbor repr """ #[batch,agent,dim]->(reshape)[batch,1,agent,dim]->(tile)[batch,agent,agent,dim] neighbor_repr = RepeatVector3D(self.num_agents)(In_agent) print("neighbor_repr.shape", neighbor_repr.shape) #[batch,agent,neighbor,agent]x[batch,agent,agent,dim]->[batch,agent,neighbor,dim] neighbor_repr = Lambda(lambda x: K.batch_dot(x[0], x[1]))( [In_neighbor, neighbor_repr]) print("neighbor_repr.shape", neighbor_repr.shape) """ attention computation """ #multi-head #[batch,agent,1,dim]->[batch,agent,1,dv*nv] agent_repr_head = Dense(dv * nv, activation='relu', kernel_initializer='random_normal', name='agent_repr_%d' % suffix)(agent_repr) #[batch,agent,1,dv,nv]->[batch,agent,nv,1,dv] agent_repr_head = Reshape( (self.num_agents, 1, dv, nv))(agent_repr_head) agent_repr_head = Lambda(lambda x: K.permute_dimensions( x, (0, 1, 4, 2, 3)))(agent_repr_head) #agent_repr_head=Lambda(lambda x:K.permute_dimensions(K.reshape(x,(-1,self.num_agents,1,dv,nv)),(0,1,4,2,3)))(agent_repr_head) #[batch,agent,neighbor,dim]->[batch,agent,neighbor,dv*nv] neighbor_repr_head = Dense(dv * nv, activation='relu', kernel_initializer='random_normal', name='neighbor_repr_%d' % suffix)(neighbor_repr) #[batch,agent,neighbor,dv,nv]->[batch,agent,nv,neighbor,dv] print("DEBUG", neighbor_repr_head.shape) print("self.num_agents,self.num_neighbors,dv,nv", self.num_agents, self.num_neighbors, dv, nv) neighbor_repr_head = Reshape( (self.num_agents, self.num_neighbors, dv, nv))(neighbor_repr_head) neighbor_repr_head = Lambda(lambda x: K.permute_dimensions( x, (0, 1, 4, 2, 3)))(neighbor_repr_head) #neighbor_repr_head=Lambda(lambda x:K.permute_dimensions(K.reshape(x,(-1,self.num_agents,self.num_neighbors,dv,nv)),(0,1,4,2,3)))(neighbor_repr_head) #[batch,agent,nv,1,dv]x[batch,agent,nv,neighbor,dv]->[batch,agent,nv,1,neighbor] att = Lambda( lambda x: K.softmax(K.batch_dot(x[0], x[1], axes=[4, 4])))( [agent_repr_head, neighbor_repr_head]) #[batch,agent,nv,1,neighbor]->[batch,agent,nv,neighbor] att_record = Reshape((self.num_agents, nv, self.num_neighbors))(att) #self embedding again neighbor_hidden_repr_head = Dense(dv * nv, activation='relu', kernel_initializer='random_normal', name='neighbor_hidden_repr_%d' % suffix)(neighbor_repr) neighbor_hidden_repr_head = Reshape( (self.num_agents, self.num_neighbors, dv, nv))(neighbor_hidden_repr_head) neighbor_hidden_repr_head = Lambda(lambda x: K.permute_dimensions( x, (0, 1, 4, 2, 3)))(neighbor_hidden_repr_head) out = Lambda(lambda x: K.mean(K.batch_dot(x[0], x[1]), axis=2))( [att, neighbor_hidden_repr_head]) out = Reshape((self.num_agents, dv))(out) out = Dense(dout, activation="relu", kernel_initializer='random_normal', name='MLP_after_relation_%d' % suffix)(out) return out, att_record
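# RepeatVector3D is a project-specific layer; per the comment above it tiles a
# (batch, agent, dim) tensor to (batch, agent, agent, dim). A Lambda sketch of
# that behaviour (an assumption, not the project's implementation):
# tile_agents = Lambda(
#     lambda x: K.tile(K.expand_dims(x, 1), (1, self.num_agents, 1, 1)))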
def _outer(AB):
    # Batched product A @ B^T: contract the last axes of the two inputs and
    # return the result with axes ordered as (batch, dim_A, dim_B).
    att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
    return K.permute_dimensions(att_ji, (0, 2, 1))
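# _outer([A, B]) is the batched product A @ B^T; a quick NumPy reference for
# the same contraction (hypothetical shapes):
import numpy as np
A = np.random.randn(2, 3, 5).astype('float32')
B = np.random.randn(2, 4, 5).astype('float32')
ref = np.einsum('bik,bjk->bij', A, B)    # (2, 3, 4)
# K.eval(_outer([K.constant(A), K.constant(B)])) should match `ref`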
def step(self, x, states): ( h_p, h_v, # 0:parent, 1:traversal x_type, # 2:treetype(ins/sub,left/right); ints of size (B,). \in {0,1,2,3} B_U, B_W) = states # 3:Udropoutmask, 4:Wdropoutmask #### matrix x has all 4 x computations in it ## per move this_Wx = self.W_x[x_type] ## B, I, 4*O matrix_x = K.batch_dot(x * B_W[0], this_Wx) + self.b_x x_zp = matrix_x[:, :self.output_dim] x_rp = matrix_x[:, self.output_dim:2 * self.output_dim] x_rv = matrix_x[:, 2 * self.output_dim:3 * self.output_dim] x_ih = matrix_x[:, 3 * self.output_dim:] #### matrix p has zp, rp; matrix v has zv, rv matrix_p = K.dot(h_p * B_U[0], self.U_p[:, :2 * self.output_dim]) # zp is for the parent unit update (resulting in child unit) inner_zp = matrix_p[:, :self.output_dim] z_p = self.inner_activation(x_zp + inner_zp) # rp is for gating to the intermediate unit of parent inner_rp = matrix_p[:, self.output_dim:2 * self.output_dim] r_p = self.inner_activation(x_rp + inner_rp) matrix_v = K.dot(h_v * B_U[0], self.U_v[:, :2 * self.output_dim]) # rv is for the intermediate gate on the traversal unit # this gets reused for both the parent's and its own intermediate inner_rv = matrix_v[:, self.output_dim:2 * self.output_dim] r_v = self.inner_activation(x_rv + inner_rv) # the actual recurrence calculations # h_p * U and h_v * U ; as gated by their r gates inner_hp = K.dot(r_p * h_p * B_U[0], self.U_p[:, 2 * self.output_dim:]) inner_hv = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:]) # h_c_tilde is the intermediate state h_c_tilde = self.activation(x_ih + inner_hp + inner_hv) # h_c is the new child state h_c = z_p * h_c_tilde + (1 - z_p) * h_p matrix_c = K.dot(h_c * B_U[0], self.U_c) + self.b_c hc_zv = matrix_c[:, :self.output_dim] hc_rv = matrix_c[:, self.output_dim:2 * self.output_dim] hc_ih = matrix_c[:, 2 * self.output_dim:] ### zv -> gate h_v and h_v_tilde ### rv -> gate h_v's contribution to h_v_tilde ### ih -> h_c's contribution to h_v_tilde # zv is for the traversal unit update. inner_zv = matrix_v[:, :self.output_dim] z_v = self.inner_activation(hc_zv + inner_zv) ## r_v is calculated with h_c rather than x r_v = self.inner_activation(hc_rv + inner_rv) inner_hvplus = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:]) h_vplus_tilde = self.activation(hc_ih + inner_hvplus) h_vplus = z_v * h_v + (1 - z_v) * h_vplus_tilde return h_c, h_vplus
def call(self, X, mask=None): # input: D (sample,c,w,d) proj_input = self.activation(tf.tensordot(X, self.att_proj, axes=[[3],[0]])) # tanh(dot(D,P))=Dl,(sample,c,w,p) if self.context == 'word': raw_att_scores = tf.tensordot(proj_input, self.att_scorer, axes=[[3],[0]]) # (sample,c,w) elif self.context == 'clause': def step(X, states): new_state = activations.tanh(tf.tensordot(X,self.encoder_weight, axes=[[2],[0]]) \ + tf.tensordot(states[0],self.recurrent_weight, axes=[[2],[0]])) return new_state, [new_state] # Make all-zero initial state. # Directly obtaining the first input dimension is not allowed, so this is the work-aronud. initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]]) proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3)) _,all_rnn_out,_ = K.rnn(step,proj_input_permute,[initial_state]) raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), self.att_scorer, axes=[[3],[0]]) elif self.context == 'bidirectional_clause': def step_forward(X, states): new_state = activations.tanh(tf.tensordot(X,self.encoder_weight_forward, axes=[[2],[0]]) \ + tf.tensordot(states[0],self.recurrent_weight_forward, axes=[[2],[0]])) return new_state, [new_state] def step_backward(X, states): new_state = activations.tanh(tf.tensordot(X,self.encoder_weight_backward, axes=[[2],[0]]) \ + tf.tensordot(states[0],self.recurrent_weight_backward, axes=[[2],[0]])) return new_state, [new_state] # Make all-zero initial state. # Directly obtaining the first input dimension is not allowed, so this is the work-aronud. initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]]) proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3)) proj_input_permute_backward = K.reverse(proj_input_permute, 1) _,all_rnn_out_forward,_ = K.rnn(step_forward,proj_input_permute,[initial_state]) _,all_rnn_out_backward,_ = K.rnn(step_backward,proj_input_permute,[initial_state]) all_rnn_out = all_rnn_out_forward+all_rnn_out_backward raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), self.att_scorer, axes=[[3],[0]]) elif self.context == 'LSTM_clause': def step(inputs, states): h_tm1 = states[0] # previous memory state c_tm1 = states[1] # previous carry state x_i = tf.tensordot(inputs, self.kernel_i,axes=[[2],[0]]) x_f = tf.tensordot(inputs, self.kernel_f,axes=[[2],[0]]) x_c = tf.tensordot(inputs, self.kernel_c,axes=[[2],[0]]) x_o = tf.tensordot(inputs, self.kernel_o,axes=[[2],[0]]) x_i = K.bias_add(x_i, self.bias_i) x_f = K.bias_add(x_f, self.bias_f) x_c = K.bias_add(x_c, self.bias_c) x_o = K.bias_add(x_o, self.bias_o) i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1, self.recurrent_kernel_i,axes=[[2],[0]])) f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1, self.recurrent_kernel_f,axes=[[2],[0]])) c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1, self.recurrent_kernel_c,axes=[[2],[0]])) o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1, self.recurrent_kernel_o,axes=[[2],[0]])) h = o * activations.tanh(c) return h, [h, c] # Make all-zero initial state. # Directly obtaining the first input dimension is not allowed, so this is the work-aronud. 
initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]]) proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3)) _,all_rnn_out,_ = K.rnn(step,proj_input_permute,[initial_state,initial_state]) raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), self.att_scorer, axes=[[3],[0]]) elif self.context == 'biLSTM_clause': def step_forward(inputs, states): h_tm1 = states[0] # previous memory state c_tm1 = states[1] # previous carry state x_i = tf.tensordot(inputs, self.kernel_i_forward,axes=[[2],[0]]) x_f = tf.tensordot(inputs, self.kernel_f_forward,axes=[[2],[0]]) x_c = tf.tensordot(inputs, self.kernel_c_forward,axes=[[2],[0]]) x_o = tf.tensordot(inputs, self.kernel_o_forward,axes=[[2],[0]]) x_i = K.bias_add(x_i, self.bias_i_forward) x_f = K.bias_add(x_f, self.bias_f_forward) x_c = K.bias_add(x_c, self.bias_c_forward) x_o = K.bias_add(x_o, self.bias_o_forward) i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1, self.recurrent_kernel_i_forward,axes=[[2],[0]])) f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1, self.recurrent_kernel_f_forward,axes=[[2],[0]])) c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1, self.recurrent_kernel_c_forward,axes=[[2],[0]])) o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1, self.recurrent_kernel_o_forward,axes=[[2],[0]])) h = o * activations.tanh(c) return h, [h, c] def step_backward(inputs, states): h_tm1 = states[0] # previous memory state c_tm1 = states[1] # previous carry state x_i = tf.tensordot(inputs, self.kernel_i_backward,axes=[[2],[0]]) x_f = tf.tensordot(inputs, self.kernel_f_backward,axes=[[2],[0]]) x_c = tf.tensordot(inputs, self.kernel_c_backward,axes=[[2],[0]]) x_o = tf.tensordot(inputs, self.kernel_o_backward,axes=[[2],[0]]) x_i = K.bias_add(x_i, self.bias_i_backward) x_f = K.bias_add(x_f, self.bias_f_backward) x_c = K.bias_add(x_c, self.bias_c_backward) x_o = K.bias_add(x_o, self.bias_o_backward) i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1, self.recurrent_kernel_i_backward,axes=[[2],[0]])) f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1, self.recurrent_kernel_f_backward,axes=[[2],[0]])) c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1, self.recurrent_kernel_c_backward,axes=[[2],[0]])) o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1, self.recurrent_kernel_o_backward,axes=[[2],[0]])) h = o * activations.tanh(c) return h, [h, c] # Make all-zero initial state. # Directly obtaining the first input dimension is not allowed, so this is the work-aronud. 
initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]]) proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3)) proj_input_permute_backward = K.reverse(proj_input_permute, 1) _,all_rnn_out_forward,_ = K.rnn(step_forward,proj_input_permute,[initial_state,initial_state]) _,all_rnn_out_backward,_ = K.rnn(step_backward,proj_input_permute_backward,[initial_state,initial_state]) all_rnn_out = K.concatenate([all_rnn_out_forward,all_rnn_out_backward],axis=-1) raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), self.att_scorer, axes=[[3],[0]]) elif self.context == 'para': raw_att_scores = K.sum(tf.tensordot(proj_input, self.att_scorer, axes=[[3],[2]]), axis = [1, 2]) # (sample,c,w) if self.hard: # Hard attention rep_att_score = K.repeat_elements(K.expand_dims(raw_att_scores),rep=self.wd,axis=-1) top = tf.nn.top_k(K.permute_dimensions(rep_att_score,(0,1,3,2)),k=self.k).indices permute_X = K.permute_dimensions(X,(0,1,3,2)) reduced_X = K.permute_dimensions(tf.batch_gather(permute_X, top),(0,1,3,2)) new_att_scores = K.softmax(tf.nn.top_k(raw_att_scores,k=self.k).values,axis=2) result = K.batch_dot(new_att_scores,reduced_X,axes=[2,2]) else: att_scores = K.softmax(raw_att_scores, axis=2) result = K.batch_dot(att_scores,X,axes=[2,2]) # (sample,c,d) if self.return_attention: return [result, raw_att_scores] else: return result
def local_conv3d(self, inputs, kernel, kernel_size, strides, output_shape,
                 data_format=None):
    """Apply 3D conv with un-shared weights.

    # Arguments
        inputs: 5D tensor with shape:
            (batch_size, channels, rows, cols, depth)
            if data_format='channels_first'
            or 5D tensor with shape:
            (batch_size, rows, cols, depth, channels)
            if data_format='channels_last'.
        kernel: the unshared weight for convolution,
            with shape (output_items, feature_dim, filters)
        kernel_size: a tuple of 3 integers, specifying the
            dimensions of the 3D convolution window.
        strides: a tuple of 3 integers, specifying the strides
            of the convolution along each spatial dimension.
        output_shape: a tuple with (output_row, output_col, output_z)
        data_format: the data format, channels_first or channels_last

    # Returns
        A 5D tensor with shape:
        (batch_size, filters, new_rows, new_cols, new_z)
        if data_format='channels_first'
        or 5D tensor with shape:
        (batch_size, new_rows, new_cols, new_z, filters)
        if data_format='channels_last'.

    # Raises
        ValueError: if `data_format` is neither
            `channels_last` nor `channels_first`.
    """
    if data_format is None:
        data_format = K.image_data_format()
    if data_format not in {'channels_first', 'channels_last'}:
        raise ValueError('Unknown data_format: ' + str(data_format))

    stride_row, stride_col, stride_z = strides
    output_row, output_col, output_z = output_shape
    kernel_shape = K.int_shape(kernel)
    _, feature_dim, filters = kernel_shape

    xs = []
    for i in range(output_row):
        for j in range(output_col):
            for k in range(output_z):
                slice_row = slice(i * stride_row,
                                  i * stride_row + kernel_size[0])
                slice_col = slice(j * stride_col,
                                  j * stride_col + kernel_size[1])
                slice_z = slice(k * stride_z,
                                k * stride_z + kernel_size[2])
                if data_format == 'channels_first':
                    xs.append(
                        K.reshape(inputs[:, :, slice_row, slice_col, slice_z],
                                  (1, -1, feature_dim)))
                else:
                    xs.append(
                        K.reshape(inputs[:, slice_row, slice_col, slice_z, :],
                                  (1, -1, feature_dim)))
    x_aggregate = K.concatenate(xs, axis=0)
    output = K.batch_dot(x_aggregate, kernel)
    output = K.reshape(output,
                       (output_row, output_col, output_z, -1, filters))
    if data_format == 'channels_first':
        output = K.permute_dimensions(output, (3, 4, 0, 1, 2))
    else:
        output = K.permute_dimensions(output, (3, 0, 1, 2, 4))
    return output
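# Example call with assumed shapes: a (batch, 8, 8, 8, 3) channels_last volume
# convolved with an unshared 2x2x2 window at stride 1 gives a 7x7x7 output, so
# the kernel must be shaped (output_items, feature_dim, filters) =
# (7 * 7 * 7, 2 * 2 * 2 * 3, filters):
# out = self.local_conv3d(volume, kernel, (2, 2, 2), (1, 1, 1), (7, 7, 7))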
def __init__(self, model, bounds, channel_axis=3, preprocessing=(0, 1), predicts='probabilities'): super(KerasModel, self).__init__(bounds=bounds, channel_axis=channel_axis, preprocessing=preprocessing) from keras import backend as K import keras from pkg_resources import parse_version assert parse_version(keras.__version__) >= parse_version( '2.0.7'), 'Keras version needs to be 2.0.7 or newer' if predicts == 'probs': predicts = 'probabilities' assert predicts in ['probabilities', 'logits'] inputs = model.input labels = K.placeholder(shape=(None, )) predictions = model.output shape = K.int_shape(predictions) _, num_classes = shape assert num_classes is not None self._num_classes = num_classes if predicts == 'probabilities': if K.backend() == 'tensorflow': predictions, = predictions.op.inputs loss = K.sparse_categorical_crossentropy(labels, predictions, from_logits=True) else: # pragma: no cover logging.warning( 'relying on numerically unstable conversion from probabilities to softmax' ) loss = K.sparse_categorical_crossentropy(labels, predictions, from_logits=False) # transform the probability predictions into logits, so that # the rest of this code can assume predictions to be logits predictions = self._to_logits(predictions) elif predicts == 'logits': loss = K.sparse_categorical_crossentropy(labels, predictions, from_logits=True) loss = K.sum(loss, axis=0) gradient, = K.gradients(loss, [inputs]) backward_grad_logits = K.placeholder(shape=predictions.shape) backward_loss = K.sum(K.batch_dot(predictions, backward_grad_logits, axes=-1), axis=0) backward_grad_inputs, = K.gradients(backward_loss, [inputs]) self._loss_fn = K.function([inputs, labels], [loss]) self._forward_fn = K.function([inputs], [predictions]) self._gradient_fn = K.function([inputs, labels], [gradient]) self._backward_fn = K.function([backward_grad_logits, inputs], [backward_grad_inputs]) self._forward_and_gradient_fn = K.function([inputs, labels], [predictions, gradient])
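# A minimal usage sketch of the wrapper above (assuming `keras_net` is a Keras
# classifier whose inputs live in [0, 1] and whose output layer is a softmax):
# fmodel = KerasModel(keras_net, bounds=(0, 1), predicts='probabilities')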
def L2X(datatype, train=True): # the whole thing is equation (5) x_train, y_train, x_val, y_val, datatype_val, input_shape = create_data( datatype, n=int(1e6)) st1 = time.time() st2 = st1 print(input_shape) activation = 'relu' # P(S|X) we train the model on this, for capturing the important features. model_input = Input(shape=(input_shape, ), dtype='float32') net = Dense(100, activation=activation, name='s/dense1', kernel_regularizer=regularizers.l2(1e-3))(model_input) net = Dense(100, activation=activation, name='s/dense2', kernel_regularizer=regularizers.l2(1e-3))(net) # A tensor of shape, [batch_size, max_sents, 100] mid_dim = input_shape * num_groups logits = Dense(mid_dim)(net) # [BATCH_SIZE, max_sents, 1] k = ks[datatype] tau = 0.1 samples = Sample_Concrete(tau, k, input_shape, num_groups, name='sample')(logits) # samples = Reshape((num_groups, input_shape))(samples) samples = Reshape((input_shape, num_groups))(samples) samples = Permute((2, 1))(samples) #samples to be KD *1 and then make a matrix K*D and the K*D * D * 1 = K * 1 the new_model_input # 1) one nueral net that gives # 2) seperate neural net with one node as input. # q(X_S) variational family # new_model_input = Multiply()([model_input, samples]) # new_model_input = Dot(samples, model_input) def matmul_output_shape(input_shapes): shape1 = list(input_shapes[0]) shape2 = list(input_shapes[1]) return tuple((shape1[0], shape1[1])) matmul_layer = Lambda(lambda x: K.batch_dot(x[0], x[1]), output_shape=matmul_output_shape) new_model_input = matmul_layer([samples, model_input]) # bs, num_groups #### here we apply instance-wise feature selection again I(Xs;Y) net2 = Dense(100, activation=activation, name='g/dense1', kernel_regularizer=regularizers.l2(1e-3))(new_model_input) net2 = Dense(100, activation=activation, name='g/dense2', kernel_regularizer=regularizers.l2(1e-3))(net2) logits = Dense(num_groups)(net2) samples_grp = Sample_Concrete_Original(tau, num_important_groups, name='group_selection')(logits) new_model_input2 = Multiply()([new_model_input, samples_grp]) #net = Dense(200, activation=activation, name = 'dense2', # kernel_regularizer=regularizers.l2(1e-3))(new_model_input) #net = BatchNormalization()(net) net = Dense(32, activation=activation, name='dense1', kernel_regularizer=regularizers.l2(1e-3))(new_model_input2) net = BatchNormalization()(net) # Add batchnorm for stability. 
net = Dense(16, activation=activation, name='dense2', kernel_regularizer=regularizers.l2(1e-3))(net) net = BatchNormalization()(net) preds = Dense(2, activation='softmax', name='dense4', kernel_regularizer=regularizers.l2(1e-3))(net) #### HERE IS FOR ANOTHER BRANCH I(Xg;Y) net3 = Dense(100, activation=activation, name='g2/dense1', kernel_regularizer=regularizers.l2(1e-3))(new_model_input) net3 = Dense(100, activation=activation, name='g2/dense2', kernel_regularizer=regularizers.l2(1e-3))(net3) preds2 = Dense(2, activation='softmax', name='g2/dense4', kernel_regularizer=regularizers.l2(1e-3))(net3) model = Model(inputs=model_input, outputs=[preds, preds2]) model.summary() if train: adam = optimizers.Adam(lr=1e-3) #### HERE CHANGE THE PROPORTION OF 2 WEIGHTS l1 = 1.0 l2 = 1.0 model.compile( loss=['categorical_crossentropy', 'categorical_crossentropy'], loss_weights=[l1, l2], optimizer=adam, metrics=['acc']) filepath = "models/{}/L2X.hdf5".format(datatype) checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') callbacks_list = [checkpoint] model.fit(x_train, [y_train, y_train], validation_data=(x_val, [y_val, y_val]), callbacks=callbacks_list, epochs=2, batch_size=BATCH_SIZE) st2 = time.time() else: model.load_weights('models/{}/L2X.hdf5'.format(datatype), by_name=True) pred_model = Model(model_input, [samples, samples_grp]) pred_model.compile(loss=None, optimizer='rmsprop', metrics=[None]) # For now samples is a matrix instead of a vector scores, scores_grp = pred_model.predict(x_val, verbose=1, batch_size=BATCH_SIZE) # We need to write a new compute_median_rank to do analysis # median_ranks = compute_median_rank(scores, k = ks[datatype], # datatype_val=datatype_val) median_ranks = compute_groups(scores) return median_ranks, time.time( ) - st2, st2 - st1, scores, scores_grp, x_val, y_val
def tangent_distance(signals, protos, subspaces, squared=False, epsilon=K.epsilon()): # Note: subspaces is always assumed as transposed and must be orthogonal! # shape(signals): batch x proto_number x channels x dim1 x dim2 x ... x dimN # shape(protos): proto_number x dim1 x dim2 x ... x dimN # shape(subspaces): (optional [proto_number]) x prod(dim1 * dim2 * ... * dimN) x prod(projected_atom_shape) # subspace should be orthogonalized signal_shape, signal_int_shape = _int_and_mixed_shape(signals) proto_shape, proto_int_shape = _int_and_mixed_shape(protos) subspace_int_shape = K.int_shape(subspaces) # check if the shapes are correct _check_shapes(signal_int_shape, proto_int_shape) with K.name_scope('tangent_distance'): atom_axes = list(range(3, len(signal_int_shape))) # for sparse signals, we use the memory efficient implementation if signal_int_shape[1] == 1: signals = K.reshape(signals, [-1, np.prod(signal_shape[3:])]) if len(atom_axes) > 1: protos = K.reshape(protos, [proto_shape[0], -1]) if K.ndim(subspaces) == 2: # clean solution without map_fn if the matrix_scope is global with K.name_scope('projectors'): projectors = K.eye(subspace_int_shape[-2]) - K.dot( subspaces, K.transpose(subspaces)) with K.name_scope('tangentspace_projections'): projected_signals = K.dot(signals, projectors) projected_protos = K.dot(protos, projectors) diss = euclidean_distance(projected_signals, projected_protos, squared=squared, epsilon=epsilon) diss = K.reshape( diss, [signal_shape[0], signal_shape[2], proto_shape[0]]) return K.permute_dimensions(diss, [0, 2, 1]) else: # no solution without map_fn possible --> memory efficient but slow! with K.name_scope('projectors'): projectors = K.eye(subspace_int_shape[-2]) - K.batch_dot( subspaces, subspaces, [2, 2]) with K.name_scope('tangentspace_projections'): projected_protos = K.transpose( K.batch_dot(projectors, protos, [1, 1])) with K.name_scope('euclidean_distance'): def projected_norm(projector): return K.sum(K.square(K.dot(signals, projector)), axis=1) diss = K.transpose(K.map_fn(projected_norm, projectors)) \ - 2 * K.dot(signals, projected_protos) \ + K.sum(K.square(projected_protos), axis=0, keepdims=True) if not squared: if epsilon == 0: diss = K.sqrt(diss) else: diss = K.sqrt(K.maximum(diss, epsilon)) diss = K.reshape( diss, [signal_shape[0], signal_shape[2], proto_shape[0]]) return K.permute_dimensions(diss, [0, 2, 1]) else: signals = K.permute_dimensions(signals, [0, 2, 1] + atom_axes) diff = signals - protos # global tangent space if K.ndim(subspaces) == 2: with K.name_scope('projectors'): projectors = K.eye(subspace_int_shape[-2]) - K.dot( subspaces, K.transpose(subspaces)) with K.name_scope('tangentspace_projections'): diff = K.reshape(diff, (signal_shape[0] * signal_shape[2], signal_shape[1], -1)) projected_diff = K.dot(diff, projectors) projected_diff = K.reshape( projected_diff, (signal_shape[0], signal_shape[2], signal_shape[1]) + signal_shape[3:]) diss = p_norm(projected_diff, order_p=2, axis=atom_axes, squared=squared, keepdims=False, epsilon=epsilon) return K.permute_dimensions(diss, [0, 2, 1]) # local tangent spaces else: with K.name_scope('projectors'): projectors = K.eye(subspace_int_shape[-2]) - K.batch_dot( subspaces, subspaces, [2, 2]) with K.name_scope('tangentspace_projections'): diff = K.reshape(diff, (signal_shape[0] * signal_shape[2], signal_shape[1], -1)) diff = K.permute_dimensions(diff, [1, 0, 2]) projected_diff = K.batch_dot(diff, projectors) projected_diff = K.reshape( projected_diff, (signal_shape[1], signal_shape[0], 
signal_shape[2]) + signal_shape[3:]) diss = p_norm(projected_diff, order_p=2, axis=atom_axes, squared=squared, keepdims=False, epsilon=epsilon) return K.permute_dimensions(diss, [1, 0, 2])
def call(self, inputs):
    # Ordinary Conv2D convolution kernel
    outputs = K.conv2d(inputs,
                       self.kernel,
                       strides=self.strides,
                       padding=self.padding,
                       data_format='channels_last',
                       dilation_rate=self.dilation_rate)
    if self.use_bias:
        outputs = K.bias_add(outputs, self.bias, data_format='channels_last')
    if self.activation is not None:
        outputs = self.activation(outputs)

    # Add second part of semi-convolutional operator
    shape = K.shape(outputs)
    shape = [shape[i] for i in range(4)]
    batch_size, x_dim, y_dim, c1 = shape

    # Create tensors containing x/y pixel locations
    xx_ones = K.ones([batch_size, x_dim], dtype='int32')
    xx_ones = K.expand_dims(xx_ones, -1)
    xx_range = K.tile(K.expand_dims(K.arange(x_dim), 0), [batch_size, 1])
    xx_range = K.expand_dims(xx_range, 1)
    xx_channel = K.batch_dot(xx_ones, xx_range)
    xx_channel = K.expand_dims(xx_channel, -1)
    xx_channel = K.cast(xx_channel, 'float32')
    if self.normalized_position:
        xx_channel = xx_channel / (K.cast(x_dim, 'float32') - 1)
        xx_channel = xx_channel * 2 - 1

    yy_ones = K.ones([batch_size, y_dim], dtype='int32')
    yy_ones = K.expand_dims(yy_ones, 1)
    yy_range = K.tile(K.expand_dims(K.arange(y_dim), 0), [batch_size, 1])
    yy_range = K.expand_dims(yy_range, -1)
    yy_channel = K.batch_dot(yy_range, yy_ones)
    yy_channel = K.expand_dims(yy_channel, -1)
    yy_channel = K.cast(yy_channel, 'float32')
    if self.normalized_position:
        yy_channel = yy_channel / (K.cast(x_dim, 'float32') - 1)
        yy_channel = yy_channel * 2 - 1

    # Concat global x and y location
    semi_tensor = K.concatenate([xx_channel, yy_channel], axis=-1)

    # Apply Lambda function
    if self.function is not None:
        semi_tensor = self.function(semi_tensor, self.normalized_position,
                                    **self.arguments)
    c2 = K.shape(semi_tensor)[-1]

    # Pad with "zero" channels
    semi_tensor = K.concatenate(
        [semi_tensor, K.zeros([batch_size, x_dim, y_dim, c1 - c2])], axis=-1)

    # Sum the convolutional output with the semi_tensor
    joint_outputs = outputs + semi_tensor
    return joint_outputs  # , semi_tensor, outputs
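# The coordinate channels above are batched outer products; a quick NumPy check
# with hypothetical sizes (batch=2, x_dim=3) that every row of xx_channel is
# simply 0, 1, 2:
import numpy as np
ones = np.ones((2, 3, 1), dtype='int32')
rng = np.tile(np.arange(3)[None, None, :], (2, 1, 1))   # (2, 1, 3)
xx = np.matmul(ones, rng)                                # (2, 3, 3)
assert (xx[0] == np.tile(np.arange(3), (3, 1))).all()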
def kl_dist(vects):
    # Returns the cross term sum_i q_i * log(p_i) between the clipped query and
    # document distributions (not the full KL divergence; see the sketch below).
    qry_vec, doc_vec = vects
    qry_vec = K.clip(qry_vec, K.epsilon(), 1)
    doc_vec = K.clip(doc_vec, K.epsilon(), 1)
    dist = K.batch_dot(qry_vec, K.log(doc_vec), 1)
    return dist
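# For reference, a full (negated) KL divergence would also subtract the entropy
# of the query distribution; a sketch in the same style as kl_dist:
def neg_kl(vects):
    qry_vec, doc_vec = vects
    qry_vec = K.clip(qry_vec, K.epsilon(), 1)
    doc_vec = K.clip(doc_vec, K.epsilon(), 1)
    return K.batch_dot(qry_vec, K.log(doc_vec) - K.log(qry_vec), 1)  # -KL(q || p)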
def call(self, inputs):  # the function that runs when the layer is called
    init_states = [
        K.zeros((K.shape(inputs)[0], K.shape(inputs)[-1])),
        K.zeros((K.shape(inputs)[0], K.shape(inputs)[-1]))
    ]  # initial states (all zeros)
    # init_states = [inputs[:, 0], inputs[:, 0]]
    # print('inputs', K.shape(inputs)[0])
    outputs = K.rnn(self.step_do, inputs, init_states,
                    unroll=False)  # run step_do over the whole sequence
    # print('outputs[1]', outputs.shape)
    print('outputs[0].shape', outputs[0].shape)

    # Four scaled dot-product attention heads over the full RNN output sequence
    query1 = K.dot(outputs[1], self.query_kernel1)
    key1 = K.dot(outputs[1], self.key_kernel1)
    value1 = K.dot(outputs[1], self.value_kernel1)
    attention_prob1 = K.batch_dot(query1, key1, axes=[2, 2]) / np.sqrt(
        self.units)
    attention_prob1 = K.softmax(attention_prob1)
    att_out1 = K.batch_dot(attention_prob1, value1, axes=[2, 1])

    query2 = K.dot(outputs[1], self.query_kernel2)
    key2 = K.dot(outputs[1], self.key_kernel2)
    value2 = K.dot(outputs[1], self.value_kernel2)
    attention_prob2 = K.batch_dot(query2, key2, axes=[2, 2]) / np.sqrt(
        self.units)
    attention_prob2 = K.softmax(attention_prob2)
    att_out2 = K.batch_dot(attention_prob2, value2, axes=[2, 1])

    query3 = K.dot(outputs[1], self.query_kernel3)
    key3 = K.dot(outputs[1], self.key_kernel3)
    value3 = K.dot(outputs[1], self.value_kernel3)
    attention_prob3 = K.batch_dot(query3, key3, axes=[2, 2]) / np.sqrt(
        self.units)
    attention_prob3 = K.softmax(attention_prob3)
    att_out3 = K.batch_dot(attention_prob3, value3, axes=[2, 1])

    query4 = K.dot(outputs[1], self.query_kernel4)
    key4 = K.dot(outputs[1], self.key_kernel4)
    value4 = K.dot(outputs[1], self.value_kernel4)
    attention_prob4 = K.batch_dot(query4, key4, axes=[2, 2]) / np.sqrt(
        self.units)
    attention_prob4 = K.softmax(attention_prob4)
    att_out4 = K.batch_dot(attention_prob4, value4, axes=[2, 1])

    # Concatenate the heads, mix them, then keep only the last time step
    att_out = K.concatenate([att_out1, att_out2, att_out3, att_out4], axis=-1)
    out = K.dot(att_out, self.switch_kernel)
    return out[:, -1]
def call(self, X, mask=None): assert isinstance(X, list) and len( X) >= 2, "Bad input expecting list of input,encoder,decoder" if (len(X) == 3): x_T, e_T, d_T = X elif (len(X) == 2): x_T, e_T = X d_T = e_T # (batch_size ,sequence_len, feature_dim) -> (batch_size ,feature_dim,sequence_len) x = K.permute_dimensions(x_T, (0, 2, 1)) # print("SHAPE!!!!", K.eval(x.shape)) # x_T = theano.printing.Print('x_T',attrs=['shape'])(x_T) if (K.backend() == "tensorflow"): assert self.seq_len != None, 'Must set Ptr_Layer(seq_len=?) if using Tensorflow' seq_len = self.seq_len else: seq_len = K.shape(e_T)[1] # Shape key: # x_T: #(batch_size ,sequence_len, feature_dim) # e_T: #(batch_size ,sequence_len, recurrent_dim) # d_T: #(batch_size ,sequence_len, recurrent_dim) # (batch_size ,sequence_len, recurrent_dim) * (recurrent_dim,att_dim) -> #(batch_size ,sequence_len,att_dim) _e_T, _d_T = K.dot(e_T, K.transpose(self.W1)), K.dot( d_T, K.transpose(self.W2)) # (batch_size ,sequence_len, att_dim) _e, _d = K.permute_dimensions(_e_T, (0, 2, 1)), K.permute_dimensions( _d_T, (0, 2, 1)) # (batch_size ,att_dim, sequence_len) # _e = theano.printing.Print('_e', attrs=['shape'])(_e) # _d = theano.printing.Print('_d', attrs=['shape'])(_d) def Tmap(fn, arrays, dtype='float32'): # assumes all arrays have same leading dim indices = K.range(K.shape(arrays[0])[0]) out = K.map_fn(lambda ii: fn(*[array[ii] for array in arrays]), indices, dtype=dtype) return out if (self.implementation == 'ptr_net'): print("PTR_NET") E_T = K.repeat_elements( K.expand_dims(_e_T, dim=1), seq_len, axis=1) # (batch_size ,sequence_len, sequence_len, att_dim) D_T = K.repeat_elements( K.expand_dims(_d_T, dim=1), seq_len, axis=1) # (batch_size ,sequence_len, sequence_len, att_dim) D = K.permute_dimensions( D_T, (0, 2, 1, 3)) # (batch_size ,sequence_len, sequence_len, att_dim) u = K.squeeze(K.dot(K.tanh(E_T + D), self.v), axis=-1) # (batch_size ,sequence_len, sequence_len) u = K.permute_dimensions(u, (0, 2, 1)) # axis=2 is row axis therefore u*x has columns that are linear combos of x u = softmax(u, axis=2) # (batch_size ,sequence_len, sequence_len) elif (self.implementation == 'ptr_net_scan'): def _ptr_net_u(_e_T, _d_T): __E_T = K.repeat_elements( K.expand_dims(_e_T, dim=0), seq_len, axis=0) # (sequence_len, sequence_len, att_dim) __D_T = K.repeat_elements( K.expand_dims(_d_T, dim=0), seq_len, axis=0) # (sequence_len, sequence_len, att_dim) __D = K.permute_dimensions( __D_T, (1, 0, 2)) # (sequence_len, sequence_len, att_dim) u = K.dot(K.tanh(__E_T + __D), self.v) # (sequence_len, sequence_len) u = K.squeeze(u, axis=-1) u = K.permute_dimensions(u, (1, 0)) u = softmax(u, axis=1) # (sequence_len, sequence_len) return u assert K.backend( ) == 'tensorflow', 'ptr_net_scan only works with tensorflow backend' import tensorflow as tf u = tf.map_fn(lambda x: _ptr_net_u(x[0], x[1]), (_e_T, _d_T), dtype=tf.float32) elif (self.implementation == 'custom'): # only onto if att_dim == sequence_len u = _e + _d_T ## (batch_size ,att_dim, att_dim) u = softmax(u, axis=2) ## (batch_size ,att_dim, att_dim) elif (self.implementation == 'custom_T'): u = _e_T + _d ## (batch_size ,att_dim, att_dim) u = softmax(u, axis=2) ## (batch_size ,att_dim, att_dim) else: raise ValueError("implementation not recognized: %r" % self.implementation) self.add_loss(giniSparsity(u, self.sparsity_coeff)) soft_sorted_x = K.batch_dot(u, x, axes=[1, 2]) # x_T = K.permute_dimensions(soft_sorted_x, (0, 2, 1)) return soft_sorted_x # +K.sum(K.sum(K.sum(u)))#+ K.sum(K.sum(K.sum(_e))) + K.sum(K.sum(K.sum(_d)))