def teacher_forced(h, states):
    """Run one GRU-style step that feeds back the ground-truth output.

    ``self``, ``h_tm1`` and ``rec_dp_mask`` are taken from the enclosing
    scope. ``states`` is accepted for signature parity and is not read.

    Args:
        h: tensor whose axis 1 holds [previous-layer input | true input]
           (per the original comment: (batch_size, 2, output_dim) —
           TODO confirm against the caller).
        states: unused.

    Returns:
        ``K.stack([output, output])`` — the new hidden state twice.

    Raises:
        ValueError: if ``self.implementation`` is not 0.
    """
    # Switch from (batch_size, previous_layer_input|true_input, output_dim)
    # to (previous_layer_input|true_input, batch_size, output_dim).
    axes = [1, 0] + list(range(2, K.ndim(h)))
    h = K.permute_dimensions(h, axes)
    prev_layer_input = h[0:1, :, :]
    true_input = h[1:, :, :self.units]
    # Teacher forcing: the "sampled" output is the true input.
    prev_sampled_output = true_input

    if self.implementation == 0:
        x_z = prev_layer_input[0, :, :self.units]
        x_r = prev_layer_input[0, :, self.units: 2 * self.units]
        x_h = prev_layer_input[0, :, 2 * self.units:]
    else:
        # str() is required: concatenating the raw (integer) value raised
        # TypeError and hid the intended ValueError.
        raise ValueError('Implementation type ' + str(self.implementation) + ' is invalid')

    z = self.recurrent_activation(
        x_z + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_z))
    r = self.recurrent_activation(
        x_r + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_r))
    hh = self.activation(
        x_h
        + K.dot(r * h_tm1 * rec_dp_mask[2], self.recurrent_kernel_h)
        + K.dot(r * prev_sampled_output, self.recurrent_kernel_y))
    output = z * h_tm1 + (1. - z) * hh
    return K.stack([output, output])
def call(self, inputs, **kwargs):
    """Mix two distributions with a feature-dependent sigmoid gate.

    ``inputs`` must be a list ``[first, second, features]``. Unless
    ``self.from_logits`` is set, the first two entries are clipped to
    [1e-10, 1.0] and converted to log space before mixing.

    Returns the softmax of the mixture, or ``[probs, mixture]`` when
    ``self.return_logits`` is true.
    """
    assert isinstance(inputs, list) and len(inputs) == 3
    first, second, features = inputs[0], inputs[1], inputs[2]

    if self.from_logits:
        log_first, log_second = first, second
    else:
        first = kb.clip(first, 1e-10, 1.0)
        second = kb.clip(second, 1e-10, 1.0)
        log_first, log_second = kb.log(first), kb.log(second)

    # Optional hidden layer applied to the gating features.
    if self.use_intermediate_layer:
        hidden = kb.dot(features, self.first_kernel)
        hidden = kb.bias_add(hidden, self.first_bias, data_format="channels_last")
        features = self.intermediate_activation(hidden)

    # Original note: embedded_features.shape = (M, T, 1)
    gate_logits = kb.bias_add(
        kb.dot(features, self.features_kernel),
        self.features_bias,
        data_format="channels_last")

    if self.use_dimension_bias:
        # Repeat the gate along the last axis so each output dimension can
        # receive its own bias.
        repeats = [1] * (kb.ndim(first) - 1) + [kb.shape(first)[-1]]
        gate_logits = kb.tile(gate_logits, repeats)
        gate_logits = kb.bias_add(
            gate_logits, self.dimensions_bias, data_format="channels_last")

    sigma = kb.sigmoid(gate_logits)
    mixture = weighted_sum(log_first, log_second, sigma,
                           self.first_threshold, self.second_threshold)
    probs = kb.softmax(mixture)
    return [probs, mixture] if self.return_logits else probs
def free_running(h, states):
    """Run one GRU-style step that feeds back the previously generated output.

    ``self``, ``h_tm1``, ``rec_dp_mask``, ``initial_states`` and
    ``random_cutoff_vec`` are taken from the enclosing scope. ``states`` is
    accepted for signature parity and is not read.

    Args:
        h: tensor whose axis 1 holds the previous-layer input in slot 0
           (per the original comment: (batch_size, 2, output_dim) —
           TODO confirm against the caller).
        states: unused.

    Returns:
        ``K.stack([output, final_output])`` where ``final_output`` is
        ``self.output_sampling(output, random_cutoff_vec)``.

    Raises:
        ValueError: if ``self.implementation`` is not 0 (previously this
        surfaced as a NameError on ``x_z``).
    """
    prev_generated_output = initial_states[0][1:, :, :]
    prev_sampled_output = prev_generated_output

    # Switch from (batch_size, previous_layer_input|true_input, output_dim)
    # to (previous_layer_input|true_input, batch_size, output_dim).
    axes = [1, 0] + list(range(2, K.ndim(h)))
    h = K.permute_dimensions(h, axes)
    prev_layer_input = h[0:1, :, :]

    if self.implementation == 0:
        x_z = prev_layer_input[0, :, :self.units]
        x_r = prev_layer_input[0, :, self.units: 2 * self.units]
        x_h = prev_layer_input[0, :, 2 * self.units:]
    else:
        raise ValueError('Implementation type ' + str(self.implementation) + ' is invalid')

    z = self.recurrent_activation(
        x_z + K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_z))
    r = self.recurrent_activation(
        x_r + K.dot(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_r))
    hh = self.activation(
        x_h
        + K.dot(r * h_tm1 * rec_dp_mask[2], self.recurrent_kernel_h)
        + K.dot(r * prev_sampled_output, self.recurrent_kernel_y))
    output = z * h_tm1 + (1. - z) * hh
    final_output = self.output_sampling(output, random_cutoff_vec)
    return K.stack([output, final_output])
def get_variational_regularization(self, X):
    """Return the GaussianKL regulariser for the posterior computed from X.

    Mean and log-sigma are both ``activation(X . W + b)`` with their own
    weights; the priors and scale come from the layer's attributes.
    """
    posterior_mean = self.activation(K.dot(X, self.W_mean) + self.b_mean)
    posterior_logsigma = self.activation(
        K.dot(X, self.W_logsigma) + self.b_logsigma)
    kl_settings = dict(
        regularizer_scale=self.regularizer_scale,
        prior_mean=self.prior_mean,
        prior_logsigma=self.prior_logsigma,
    )
    return GaussianKL(posterior_mean, posterior_logsigma, **kl_settings)
def call(self, inputs, **kwargs):
    """Blend a dense projection with the raw inputs through a learned gate.

    Returns ``gate * dense(inputs) + (1 - gate) * inputs``.
    """
    gate_logits = kb.bias_add(
        kb.dot(inputs, self.gate_kernel),
        self.gate_bias,
        data_format="channels_last")
    gate = self.activation(gate_logits)

    candidate = kb.bias_add(
        kb.dot(inputs, self.dense_kernel),
        self.dense_bias,
        data_format="channels_last")

    carry = (1.0 - gate) * inputs
    return gate * candidate + carry
def step(self, x, states):
    """Single step of a simple RNN: activation(x.W + b + prev.U).

    ``states`` must hold exactly one entry, the previous output. Returns
    ``(output, [output])``.
    """
    # The only state carried between steps is the previous output.
    assert len(states) == 1
    previous = states[0]
    input_term = K.dot(x, self.W) + self.b
    recurrent_term = K.dot(previous, self.U)
    output = self.activation(input_term + recurrent_term)
    return output, [output]
def step(self, x, states):
    """Attend over the static input, then run the wrapped cell for one step.

    The last two entries of ``states`` are constants: the static input and
    its pre-computed projection. The attention weights are concatenated onto
    the returned output so they survive the RNN wrapper.
    """
    prev_h = states[0]  # states[1] necessary?
    # Constants appended after the real states:
    static_input = states[-2]
    # Per the original note this equals K.dot(static_x, self._W1) + self._b2
    # with X.shape = [bs, L, static_input_dim] (computed outside this block).
    static_projection = states[-1]

    # Add a length-1 axis so this step's vector can be added to the
    # projection that is shared by all time steps.
    state_projection = K.expand_dims(K.dot(prev_h, self._W2), 1)
    scores = K.dot(static_projection + state_projection, self._V)
    attention = K.softmax(scores, axis=1)
    context = K.sum(attention * static_input, [1])

    cell_input = K.dot(K.concatenate([x, context], 1), self._W3) + self._b3
    cell_output, new_states = self.layer.cell.call(cell_input, states[:-2])

    # "Smuggle" the attention weights out of the RNN wrapper via the output.
    flat_attention = K.squeeze(attention, -1)
    return K.concatenate([cell_output, flat_attention]), new_states
def call(self, x):
    """Multi-head dot-product attention with optional length masking.

    Args:
        x: ``[Q_seq, K_seq, V_seq]`` (no masking) or
           ``[Q_seq, K_seq, V_seq, Q_len, V_len]`` (padded positions are
           masked through ``self.Mask``).

    Returns:
        The attended sequence, reshaped so its last axis is
        ``self.output_dim``.

    Raises:
        ValueError: if ``x`` does not contain exactly 3 or 5 entries
        (previously this surfaced as an UnboundLocalError).
    """
    # With only Q_seq, K_seq, V_seq no mask is applied; with Q_len and V_len
    # as well, the surplus (padded) part is masked.
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x
    else:
        raise ValueError(
            'Expected 3 inputs (Q, K, V) or 5 inputs (Q, K, V, Q_len, V_len), '
            'got ' + str(len(x)))

    # Linear projections of Q, K and V, split into heads.
    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Inner product, then mask, then softmax.
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3])
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Weighted sum of values, merge heads, then mask the output.
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq
def call(self, x):
    """Collapse axis 1 of ``x`` into one attention-weighted sum.

    Only the TensorFlow backend is accepted (asserted). ``x`` is indexed as
    a rank-3 tensor; presumably (batch, steps, features) — TODO confirm.
    """
    assert(K.backend() == 'tensorflow')

    # Run the stack of sigmoid layers on the transposed input.
    hidden = K.permute_dimensions(x, (0, 2, 1))
    for depth in range(0, self.attention_depth):
        hidden = K.sigmoid(K.dot(hidden, self.Ws[depth]) + self.bs[depth])
    hidden = K.permute_dimensions(hidden, (0, 2, 1))

    # One raw score per position along axis 1.
    scores = K.squeeze(K.dot(hidden, K.expand_dims(self.Wf, -1)), -1)
    squashed = K.tanh(scores + self.bias)

    # Turn the scores into weights that sum to one along axis 1:
    # exp(x) / sum(exp(xi)).
    weights = K.exp(squashed)
    normaliser = K.sum(weights, axis=1, keepdims=True)
    weights /= K.cast(normaliser, K.floatx())

    # For testing only: returning `weights` here exposes the attention
    # distribution.

    # Scale each position by its weight and sum along axis 1.
    weighted = x * K.expand_dims(weights, -1)
    return K.sum(weighted, axis=1)
def step(self, x, states):
    """Advance the controller and the neural stack by one time step.

    ``states`` is laid out as ``[r, V, s, time, *controller_states]``.
    Returns ``(o_t, [r_t, V_t, s_t, time] + controller_states)``.
    """
    r_tm1, V_tm1, s_tm1, time = states[:4]
    controller_states = states[4:]

    # The controller sees the input joined with the previous stack read.
    controller_input = T.concatenate([x, r_tm1], axis=-1)
    op_t, h_t = _update_controller(self, controller_input, controller_states)

    # Four heads computed from the controller output.
    d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
    u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
    v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)
    o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)

    time = time + 1
    V_t, s_t, r_t = _update_neural_stack(
        self, V_tm1, s_tm1, d_t[:, 0], u_t[:, 0], v_t, time[0],
        stack=self.stack)
    return o_t, [r_t, V_t, s_t, time] + h_t
def get_variational_regularization(self, X):
    """Return the GaussianKL regulariser for X flattened to two dimensions.

    X is first reshaped to ``(-1, self.input_shape[-1])``; mean and
    log-sigma are then ``activation(X . W + b)`` with their own weights.
    """
    flat = K.reshape(X, (-1, self.input_shape[-1]))
    posterior_mean = self.activation(K.dot(flat, self.W_mean) + self.b_mean)
    posterior_logsigma = self.activation(
        K.dot(flat, self.W_logsigma) + self.b_logsigma)
    kl_settings = dict(
        regularizer_scale=self.regularizer_scale,
        prior_mean=self.prior_mean,
        prior_logsigma=self.prior_logsigma,
    )
    return GaussianKL(posterior_mean, posterior_logsigma, **kl_settings)
def call(self, inputs, states, constants):
    """Linear recurrent cell: sum of input, state and constant projections.

    ``states`` and ``constants`` must each hold exactly one tensor.
    Returns ``(output, [output])``.
    """
    (prev_output,) = states
    (constant,) = constants
    projections = (
        K.dot(inputs, self.input_kernel),
        K.dot(prev_output, self.recurrent_kernel),
        K.dot(constant, self.constant_kernel),
    )
    output = projections[0] + projections[1] + projections[2]
    return output, [output]
def get_output(self, train=False):
    """Return the encoding, or the tied-weight reconstruction.

    The reconstruction (decode with ``W`` transposed) is returned when
    ``self.pretrain`` or ``self.output_reconstruction`` is set.
    """
    X = self.get_input(train)
    encoded = self.activation(K.dot(X, self.W))
    if not (self.pretrain or self.output_reconstruction):
        return encoded
    decoded = K.dot(encoded, K.transpose(self.W))
    return self.reconstruction_activation(decoded)
def call(self, x, mask=None):
    """Encode ``x`` with ``self.step`` and continue with ``self.dream``.

    The recurrent outputs of both phases are concatenated on axis 1 and
    projected with ``self.V`` / ``self.ext_b``. Returns the full sequence
    when ``self.return_sequences`` is set, otherwise the last output.
    """
    # input shape: (nb_samples, time (padded with zeros), input_dim)
    # Subclasses' .build() MUST define self.input_spec with a complete
    # input shape.
    input_shape = self.input_spec[0].shape
    if K._BACKEND == 'tensorflow' and not input_shape[1]:
        raise Exception('When using TensorFlow, you should define '
                        'explicitly the number of timesteps of '
                        'your sequences.\n'
                        'If your first layer is an Embedding, '
                        'make sure to pass it an "input_length" '
                        'argument. Otherwise, make sure '
                        'the first layer has '
                        'an "input_shape" or "batch_input_shape" '
                        'argument, including the time axis. '
                        'Found input shape at layer ' + self.name +
                        ': ' + str(input_shape))

    initial_states = (self.states if self.stateful
                      else self.get_initial_states(x))
    constants = self.get_constants(x)
    preprocessed_input = self.preprocess_input(x)

    # Phase 1: consume the real input sequence.
    last_output, observed_outputs, states = K.rnn(
        self.step, preprocessed_input, initial_states,
        go_backwards=self.go_backwards,
        mask=mask,
        constants=constants,
        unroll=self.unroll,
        input_length=input_shape[1])

    # Phase 2: run self.dream for output_length steps on a zero "timer"
    # input, starting from the states phase 1 ended in.
    timer = K.zeros((2, self.output_length, 2))
    last_output, dreamed_outputs, states = K.rnn(
        self.dream, timer, states,
        go_backwards=self.go_backwards,
        mask=mask,
        constants=constants,
        input_length=self.output_length,
        unroll=self.unroll)

    last_output = K.dot(last_output, self.V) + self.ext_b

    # Project every step through V after flattening, then restore the
    # (batch, time, features) layout.
    outputs = K.concatenate([observed_outputs, dreamed_outputs], axis=1)
    outputs = K.dot(K.reshape(outputs, (-1, self.output_dim)), self.V) + self.ext_b
    ishape = K.shape(x)
    if K._BACKEND == "tensorflow":
        ishape = x.get_shape().as_list()
    outputs = K.reshape(
        outputs, (-1, ishape[1] + self.output_length, ishape[2]))

    if self.stateful:
        self.updates = [(self.states[idx], states[idx])
                        for idx in range(len(states))]

    return outputs if self.return_sequences else last_output
def call(self, x, mask=None):
    """Average the label distributions predicted by an ensemble of trees.

    For each tree, decision probabilities come from ``sigmoid(x . w_d)`` and
    leaf label distributions from ``softmax(w_l)``. A static index pattern
    routes every leaf to the decision (or complement) probability it depends
    on at each depth; the product over depths is the leaf probability.

    Args:
        x: input tensor fed to every tree.
        mask: unused.

    Returns:
        The sum of per-tree label probabilities divided by ``self.n_trees``
        (``None`` divided by n_trees would fail if the ensembles are empty).
    """
    N_DECISION = (2 ** (self.n_depth)) - 1  # Number of decision nodes
    N_LEAF = 2 ** (self.n_depth + 1)  # Number of leaf nodes

    flat_decision_p_e = []
    leaf_p_e = []
    for w_d, w_l in zip(self.w_d_ensemble, self.w_l_ensemble):
        decision_p = K.sigmoid((K.dot(x, w_d)))
        leaf_p = K.softmax(w_l)
        decision_p_comp = 1 - decision_p
        # Decision probabilities first, their complements after them.
        decision_p_pack = K.concatenate([decision_p, decision_p_comp])
        flat_decision_p_e.append(decision_p_pack)
        leaf_p_e.append(leaf_p)

    # Construct the tiling pattern for the decision probability matrix.
    # It is built statically with NumPy rather than in the graph.
    tiling_pattern = np.zeros((N_LEAF, self.n_depth), dtype=np.int32)
    comp_offset = N_DECISION
    dec_idx = 0
    # `range` instead of `xrange`: identical iteration, and defined on
    # Python 3 as well as Python 2.
    for n in range(self.n_depth):
        j = 0
        for _node in range(2 ** n):
            repeat_times = 2 ** (self.n_depth - n)
            for _ in range(repeat_times):
                tiling_pattern[j][n] = dec_idx
                j = j + 1
            for _ in range(repeat_times):
                tiling_pattern[j][n] = comp_offset + dec_idx
                j = j + 1
            dec_idx = dec_idx + 1

    flat_pattern = tiling_pattern.flatten()

    # Iterate over each tree and accumulate its label probabilities.
    tree_ret = None
    for flat_decision_p, leaf_p in zip(flat_decision_p_e, leaf_p_e):
        flat_mu = tf.transpose(
            tf.gather(tf.transpose(flat_decision_p), flat_pattern))
        batch_size = tf.shape(flat_decision_p)[0]
        shape = tf.pack([batch_size, N_LEAF, self.n_depth])
        mu = K.reshape(flat_mu, shape)
        leaf_prob = K.prod(mu, [2])
        prob_label = K.dot(leaf_prob, leaf_p)

        if tree_ret is None:
            tree_ret = prob_label
        else:
            tree_ret = tree_ret + prob_label

    return tree_ret/self.n_trees
def dream(self, x, states):
    """Free-running step: feed the projected previous state back as input.

    ``x`` is ignored. The fed-back input is wrapped in ``tf.stop_gradient``.
    ``states`` is ``[previous state, B_U, B_W]``; returns
    ``(output, [output])``.
    """
    previous_state, B_U, B_W = states[0], states[1], states[2]
    # Reconstruct the external input from the previous state.
    fed_back = tf.stop_gradient(K.dot(previous_state, self.V) + self.ext_b)
    input_term = K.dot(fed_back * B_W, self.W) + self.b
    recurrent_term = K.dot(previous_state * B_U, self.U)
    output = self.activation(input_term + recurrent_term)
    return output, [output]
def call(self, x, mask=None):
    """Score attachment labels from head, preposition and child encodings.

    Args:
        x: list or tuple with
           x[0]: (batch_size, input_length, input_dim) encoded sentence,
           x[1]: (batch_size, 1) indices of prepositions,
           x[2] (only read when ``self.with_attachment_probs``):
                 (batch_size, input_length - 2) head probabilities.
           Shapes are taken from the original comments.
        mask: unused.

    Returns:
        Softmax label probabilities, (batch_size, num_classes).
    """
    assert isinstance(x, (list, tuple))
    encoded_sentence = x[0]
    prep_indices = K.squeeze(x[1], axis=-1)  # (batch_size,)
    batch_indices = K.arange(K.shape(encoded_sentence)[0])  # (batch_size,)

    if self.with_attachment_probs:
        # We're essentially doing K.argmax(x[2]) here, but argmax is not
        # differentiable!
        head_probs = x[2]
        head_probs_padding = K.zeros_like(x[2])[:, :2]  # (batch_size, 2)
        # (batch_size, input_length)
        padded_head_probs = K.concatenate([head_probs, head_probs_padding])
        # (batch_size, 1)
        max_head_probs = K.expand_dims(K.max(padded_head_probs, axis=1))
        # (batch_size, input_length, 1)
        max_head_prob_indices = K.expand_dims(
            K.equal(padded_head_probs, max_head_probs))
        # (batch_size, input_length, input_dim)
        masked_head_encoding = K.switch(max_head_prob_indices,
                                        encoded_sentence,
                                        K.zeros_like(encoded_sentence))
        # (batch_size, input_dim)
        head_encoding = K.sum(masked_head_encoding, axis=1)
    else:
        # The head is taken to be the token right before the preposition.
        head_indices = prep_indices - 1  # (batch_size,)
        head_encoding = encoded_sentence[batch_indices, head_indices, :]  # (batch_size, input_dim)

    prep_encoding = encoded_sentence[batch_indices, prep_indices, :]  # (batch_size, input_dim)
    child_encoding = encoded_sentence[batch_indices, prep_indices+1, :]  # (batch_size, input_dim)

    # NOTE: a disabled alternative that used mask[0] and
    # self.get_split_averages(...) used to live here inside a string
    # literal; it never executed and has been removed.

    head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, proj_dim)
    prep_projection = K.dot(prep_encoding, self.proj_prep)  # (batch_size, proj_dim)
    child_projection = K.dot(child_encoding, self.proj_child)  # (batch_size, proj_dim)

    # (batch_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(
            head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type in HC
        composed_projection = K.tanh(head_projection + child_projection)

    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, proj_dim)

    # (batch_size, num_classes)
    class_scores = K.dot(composed_projection, self.scorer)
    label_probabilities = K.softmax(class_scores)
    return label_probabilities
def call(self, argument, mask=None):
    """Refine test and support embeddings with iterated attention.

    Parameters
    ----------
    argument: list
      Two tensors (X, Xp). Per the original documentation X is
      (n_test, n_feat) and Xp is (n_support, n_feat).
    mask: unused.

    Returns
    -------
    list
      ``[X + p, Xp + q]`` where p and q are the attention states after
      ``self.max_depth`` rounds; same shapes as the inputs.
    """
    x, xp = argument

    # Initial attention vectors and LSTM states.
    p, q = self.p_init, self.q_init
    support_states = self.support_states_init
    test_states = self.test_states_init

    # z is the running view of the support set; it starts as the raw support.
    z = xp

    for _ in range(self.max_depth):
        # Support attends over itself.
        support_attention = K.softmax(cos(z + q, xp))
        # Linear combination of the support set.
        r = K.dot(support_attention, xp)

        # (Original author's open question: update z here or at the end of
        # the round; it is updated at the end.)

        # Test set attends over the current support view.
        test_attention = K.softmax(cos(x + p, z))
        s = K.dot(test_attention, z)

        # New support attention state.
        qr = K.concatenate([q, r], axis=1)
        q, support_states = self.support_lstm([qr] + support_states)

        # New test attention state.
        ps = K.concatenate([p, s], axis=1)
        p, test_states = self.test_lstm([ps] + test_states)

        # Redefine the support view for the next round.
        z = r

    # Alternative considered by the original author: [x + p, z + q]
    return [x + p, xp + q]
def dream(self, x, states):
    """Free-running step driven by control inputs plus the fed-back output.

    The first ``self.control_dim`` columns of ``x`` are kept as controls and
    joined with the (gradient-stopped) projection of the previous state.
    ``states`` is ``[previous state, B_U, B_W]``; returns
    ``(output, [output])``.
    """
    previous_state, B_U, B_W = states[0], states[1], states[2]
    controls = x[:, :self.control_dim]
    fed_back = tf.stop_gradient(K.dot(previous_state, self.V) + self.ext_b)
    step_input = K.concatenate([controls, fed_back], axis=1)
    input_term = K.dot(step_input * B_W, self.W) + self.b
    recurrent_term = K.dot(previous_state * B_U, self.U)
    output = self.activation(input_term + recurrent_term)
    return output, [output]
def __call__(self, x):
    """Penalise the covariance between slices taken along ``self.axis``.

    The axes in ``self.axis`` (a list) are moved to the front, everything
    else is flattened, rows are mean-centred, and half the sum of squared
    entries of the row-by-row product (divided by the row length) is
    returned. When ``self.division_idx`` is set, only the product between
    the rows before and after that index is penalised.

    Relies on ``x.reshape`` / ``x.shape`` / ``.T`` on the tensor itself, as
    the original did.
    """
    regularization = 0
    # Remaining axes in ascending order. Building the list explicitly avoids
    # depending on the iteration order of a set difference.
    remaining_axes = [d for d in range(K.ndim(x)) if d not in self.axis]
    dimorder = self.axis + remaining_axes
    x = K.permute_dimensions(x, dimorder)
    x = x.reshape((x.shape[0], -1))
    x -= K.mean(x, axis=1, keepdims=True)
    if self.division_idx is not None:
        cross = K.dot(x[:self.division_idx], x[self.division_idx:].T)
        regularization += .5*K.sum(K.square(cross/x.shape[1]))
    else:
        regularization += .5*K.sum(K.square(K.dot(x, x.T)/x.shape[1]))
    return regularization
def __call__(self, x):
    """Soft penalty on the pairwise similarity of normalised slices.

    With ``self.axis == 'last'`` the slices are the columns of ``x``
    reshaped to (-1, last_dim); with ``'first'`` they are the rows of ``x``
    reshaped to (first_dim, -1). Each slice is divided by its L2 norm and
    ``gamma * sum(log(1 + exp(lam * (similarity - 1))))`` is accumulated
    over the off-diagonal entries.

    Returns ``None`` for any other ``self.axis`` value (unchanged from the
    original fall-through).
    """
    xshape = K.int_shape(x)
    # `==` rather than `is`: identity comparison against a string literal
    # only works by accident of interning.
    if self.axis == 'last':
        x = K.reshape(x, (-1, xshape[-1]))
        x /= K.sqrt(K.sum(K.square(x), axis=0, keepdims=True))
        xx = K.dot(K.transpose(x), x)
        off_diagonal = 1.0 - K.eye(xshape[-1])
        return self.gamma * K.sum(
            K.log(1.0 + K.exp(self.lam * (xx - 1.0))) * off_diagonal)
    elif self.axis == 'first':
        x = K.reshape(x, (xshape[0], -1))
        x /= K.sqrt(K.sum(K.square(x), axis=1, keepdims=True))
        xx = K.dot(x, K.transpose(x))
        off_diagonal = 1.0 - K.eye(xshape[0])
        return self.gamma * K.sum(
            K.log(1.0 + K.exp(self.lam * (xx - 1.0))) * off_diagonal)
    return None
def step(self, x, states):
    """Simple RNN step with dropout masks carried in ``states``.

    ``states`` is ``[previous output, B_U, B_W]``. When
    ``self.consume_less == 'cpu'`` the input is used as-is instead of being
    projected here. Returns ``(output, [output])``.
    """
    previous, B_U, B_W = states[0], states[1], states[2]
    if self.consume_less == 'cpu':
        input_term = x
    else:
        input_term = K.dot(x * B_W, self.W) + self.b
    recurrent_term = K.dot(previous * B_U, self.U)
    output = self.activation(input_term + recurrent_term)
    return output, [output]
def call(self, x, mask=None):
    """Sigmoid gate computed from the context and its attended question.

    ``x`` is ``(x_cont, x_ques, ques_len)``. Attention logits between the
    ReLU-projected context and question are scaled by
    ``sqrt(self.filters)``, masked with ``self.mask_logits`` and
    soft-maxed; the attended question is concatenated to the context on
    axis 2 and projected with ``self.V``.
    """
    x_cont, x_ques, ques_len = x
    context_len = x_cont.shape.as_list()[1]

    projected_context = tf.nn.relu(K.dot(x_cont, self.WC))
    projected_question = tf.nn.relu(K.dot(x_ques, self.WQ))

    scores = tf.matmul(projected_context, projected_question,
                       transpose_b=True) / (self.filters ** 0.5)
    scores = self.mask_logits(scores, ques_len, clen=context_len)
    attention = tf.nn.softmax(scores)

    attended_question = tf.matmul(attention, x_ques)
    joined = tf.concat([x_cont, attended_question], axis=2)
    return tf.nn.sigmoid(K.dot(joined, self.V))
def free_energy(self, x):
    """Free energy of a Bernoulli RBM for visible units ``x``.

    Writing the marginal as p(x) = 1/Z exp(-F(x)), the free energy is

        F(x) = -sum_{j=1..H} log(1 + exp(x^T W[:, j] + bh_j)) - bx^T x
    """
    hidden_pre_activation = K.dot(x, self.W) + self.bh
    # softplus summed over axis 1
    hidden_contribution = K.sum(K.log(1 + K.exp(hidden_pre_activation)), axis=1)
    visible_contribution = K.dot(x, self.bx)
    return -hidden_contribution - visible_contribution
def call(self, x):
    """Highway layer: ``T * activation(x.W + b) + (1 - T) * x``.

    ``T`` is the sigmoid transform gate computed from ``W_carry`` /
    ``b_carry``; biases are only added when ``self.bias`` is set.
    """
    gate_pre = K.dot(x, self.W_carry)
    if self.bias:
        gate_pre += self.b_carry
    transform_weight = activations.sigmoid(gate_pre)

    hidden_pre = K.dot(x, self.W)
    if self.bias:
        hidden_pre += self.b
    transformed = self.activation(hidden_pre)
    transformed *= transform_weight

    carried = (1 - transform_weight) * x
    return transformed + carried
def __loss(y_true, y_pred):
    """Categorical cross-entropy plus a recurrent-kernel diversity penalty.

    ``layers``, ``rnn_units``, ``lmbd`` and ``keras`` come from the
    enclosing scope. For every (forward, backward) pair the third
    ``rnn_units``-wide column block of ``cell.trainable_weights[1]`` is
    flattened; the squared distance between the Gram matrix of these
    vectors and the identity is penalised, separately per direction.
    """
    def _flat_block(rnn):
        # Columns [2 * rnn_units, 3 * rnn_units) of the second weight.
        block = rnn.cell.trainable_weights[1][:, rnn_units * 2:rnn_units * 3]
        return K.reshape(block, (rnn_units * rnn_units,))

    def _gram_penalty(phi):
        gram = K.dot(phi, K.transpose(phi))
        return K.sum(K.square(gram - K.eye(len(layers))))

    forward_rows, backward_rows = [], []
    for forward, backward in layers:
        forward_rows.append(_flat_block(forward))
        backward_rows.append(_flat_block(backward))

    loss_sim_forward = _gram_penalty(K.stack(forward_rows))
    loss_sim_backward = _gram_penalty(K.stack(backward_rows))
    loss_cat = keras.losses.categorical_crossentropy(y_true, y_pred)
    return loss_cat + lmbd * (loss_sim_forward + loss_sim_backward)
def call(self, x, mask=None):
    """Attention pooling over axis 1 with optional masking.

    Returns ``[pooled, weights]`` where ``pooled`` is the mean over axis 1
    of ``x`` scaled by the attention weights, and ``weights`` has shape
    ``(-1, self.timesteps)``.
    """
    hidden = K.dot(x, self.W)
    if self.bias:
        hidden += self.b
    hidden = K.tanh(hidden)

    scores = K.reshape(K.dot(hidden, self.U), (-1, self.timesteps))
    unnormalised = K.exp(scores)
    if mask is not None:
        # Zero out masked positions before normalising.
        unnormalised *= K.cast(mask, K.floatx())

    # Epsilon keeps the division finite when every position is masked.
    normaliser = K.cast(
        K.sum(unnormalised, axis=-1, keepdims=True) + K.epsilon(), K.floatx())
    a_weights = unnormalised / normaliser

    weighted_output = x * K.expand_dims(a_weights, axis=-1)
    return [K.mean(weighted_output, axis=1), a_weights]
def step(self, x, states):
    """Decoder step: previous output plus projected input drives the LSTM.

    ``states`` must hold five entries; entry 2 is the previous output
    ``y_tm1`` and is removed before the remaining states are handed to the
    parent ``step``. The parent step is run with ``self.output_dim``
    temporarily set to ``self.hidden_dim``.

    Returns ``(y_t, new_states + [y_t])``.
    """
    assert len(states) == 5, len(states)
    states = list(states)
    y_tm1 = states.pop(2)
    v = self.activation(K.dot(x, self.W_x) + self.b_x)
    y_tm1 += v

    output_dim = self.output_dim
    self.output_dim = self.hidden_dim
    try:
        h_t, new_states = super(LSTMDecoder, self).step(y_tm1, states)
    finally:
        # Always restore the real output_dim, even if the parent step
        # raises, so the layer is not left misconfigured.
        self.output_dim = output_dim

    y_t = self.activation(K.dot(h_t, self.W_y) + self.b_y)
    new_states += [y_t]
    return y_t, new_states
def step(self, x, states):
    """Run the wrapped layer's step and rescale its state by a learned gate.

    The gate is computed from the new hidden state and ``states[4]`` (the
    attention vector). With ``self.single_attention_param`` the gate is
    repeated ``self.layer.output_dim`` times along axis 1.

    Returns ``(h, [h, c])`` with the gated hidden state.
    """
    # Only the returned states are used; the first return value is discarded.
    _, (h, c) = self.layer.step(x, states)
    attention = states[4]

    m = self.attn_activation(K.dot(h, self.U_a) * attention + self.b_a)
    s = K.sigmoid(K.dot(m, self.U_s) + self.b_s)

    if self.single_attention_param:
        gate = K.repeat_elements(s, self.layer.output_dim, axis=1)
    else:
        gate = s
    h = h * gate
    return h, [h, c]
def dot_product(x, kernel):
    """Backend-agnostic dot product of ``x`` with a weight ``kernel``.

    On TensorFlow the kernel gets an extra trailing axis before ``K.dot``
    and that axis is squeezed from the result; other backends call
    ``K.dot`` directly.

    Args:
        x: input tensor.
        kernel: weights.

    Returns:
        The product tensor.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    expanded = K.dot(x, K.expand_dims(kernel))
    return K.squeeze(expanded, axis=-1)
### INITIALIZING CONSTANTS
n_input = 272
tau = 0.1
lambda_step = 0.1
soft_thr = 0.1
conv_size = 32
filter_size = 3

### PREPARING THE MODEL (An image of the model map has been attached)
# Model inputs: the measurement vector and the labels.
inp = Input(shape=(n_input,))
inp_labels = Input(shape=(1089,))

# Inputs to the first ISTA block: the measurements multiplied by phi_inv,
# and by the transpose of phi.
x0 = Lambda(lambda x: K.dot(x, K.constant(phi_inv)))(inp)
phi_tb = Lambda(lambda x: K.dot(x, K.constant(np.transpose(phi))))(inp)

# ISTA block #1
# Gradient-style update: x - step * (x . ptp) + step * phi_tb
conv1_x1 = Lambda(
    lambda x: x - lambda_step * K.dot(x, K.constant(ptp)) + lambda_step * phi_tb,
    name='conv1_x1',
)(x0)
# Reshape the flat 1089-vector into a 33 x 33 single-channel image.
conv1_x2 = Reshape((33, 33, 1), name='conv1_x2')(conv1_x1)
conv1_x3 = Conv2D(
    filters=conv_size, kernel_size=[filter_size, filter_size],
    padding='SAME', use_bias=False, name='conv1_x3',
)(conv1_x2)

# Two convolutions applied before the thresholding.
conv1_sl1 = Conv2D(
    filters=conv_size, kernel_size=[filter_size, filter_size],
    padding='SAME', use_bias=False, activation='relu', name='conv1_sl1',
)
conv1_x4 = conv1_sl1(conv1_x3)
conv1_sl2 = Conv2D(
    filters=conv_size, kernel_size=[filter_size, filter_size],
    padding='SAME', use_bias=False, name='conv1_sl2',
)
conv1_x44 = conv1_sl2(conv1_x4)

# Soft threshold: sign(x) * relu(|x| - soft_thr)
conv1_x5 = Multiply(name='conv1_x5')([
    Lambda(lambda x: K.sign(x))(conv1_x44),
    Lambda(lambda x: relu(x - soft_thr))(Lambda(lambda x: K.abs(x))(conv1_x44)),
])

# Two convolutions applied after the thresholding.
conv1_sl3 = Conv2D(
    filters=conv_size, kernel_size=[filter_size, filter_size],
    padding='SAME', use_bias=False, activation='relu', name='conv1_sl3',
)
conv1_x6 = conv1_sl3(conv1_x5)
conv1_sl4 = Conv2D(
    filters=conv_size, kernel_size=[filter_size, filter_size],
    padding='SAME', use_bias=False, name='conv1_sl4',
)
conv1_x66 = conv1_sl4(conv1_x6)
def step(self, inputs, states):
    """One step of a multiplicative LSTM.

    ``states`` is ``[h_tm1, c_tm1, dp_mask, rec_dp_mask]``. The intermediate
    state ``m`` (product of an input projection and a hidden projection)
    replaces ``h_tm1`` in all recurrent terms. Three implementation modes
    are supported:

      * 2 - fused kernels, gates sliced out of one pre-activation;
      * 0 - ``inputs`` already holds the four gate projections in its first
            ``4 * units`` columns, followed by the raw input;
      * 1 - separate per-gate kernels and biases.

    Returns ``(h, [h, c])``.
    Raises ValueError for any other ``self.implementation``.
    """
    h_tm1, c_tm1 = states[0], states[1]
    dp_mask, rec_dp_mask = states[2], states[3]
    units = self.units

    if self.implementation == 2:
        m = (K.dot(inputs, self.multiplicative_kernel_x)
             * K.dot(h_tm1, self.multiplicative_kernel_h))
        fused = K.dot(inputs * dp_mask[0], self.kernel)
        fused += K.dot(m * rec_dp_mask[0], self.recurrent_kernel)
        if self.use_bias:
            fused = K.bias_add(fused, self.bias)

        pre_i = fused[:, :units]
        pre_f = fused[:, units:2 * units]
        pre_c = fused[:, 2 * units:3 * units]
        pre_o = fused[:, 3 * units:4 * units]

        i = self.recurrent_activation(pre_i)
        f = self.recurrent_activation(pre_f)
        c = f * c_tm1 + i * self.activation(pre_c)
        o = self.recurrent_activation(pre_o)
    else:
        if self.implementation == 0:
            # Raw input lives after the four pre-computed gate projections.
            raw_input = inputs[:, 4 * units:]
            m = (K.dot(raw_input, self.multiplicative_kernel_x)
                 * K.dot(h_tm1, self.multiplicative_kernel_h))
            x_i = inputs[:, :units]
            x_f = inputs[:, units:2 * units]
            x_c = inputs[:, 2 * units:3 * units]
            x_o = inputs[:, 3 * units:4 * units]
        elif self.implementation == 1:
            m = (K.dot(inputs, self.multiplicative_kernel_x)
                 * K.dot(h_tm1, self.multiplicative_kernel_h))
            x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
            x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
            x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
            x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
        else:
            raise ValueError('Unknown `implementation` mode.')

        i = self.recurrent_activation(
            x_i + K.dot(m * rec_dp_mask[0], self.recurrent_kernel_i))
        f = self.recurrent_activation(
            x_f + K.dot(m * rec_dp_mask[1], self.recurrent_kernel_f))
        c = f * c_tm1 + i * self.activation(
            x_c + K.dot(m * rec_dp_mask[2], self.recurrent_kernel_c))
        o = self.recurrent_activation(
            x_o + K.dot(m * rec_dp_mask[3], self.recurrent_kernel_o))

    h = o * self.activation(c)
    if 0 < self.dropout + self.recurrent_dropout:
        # Flag the output as learning-phase dependent when any dropout is on.
        h._uses_learning_phase = True
    return h, [h, c]
def gram_matrix(x):
    """Return the Gram matrix of a rank-3 tensor.

    The tensor is batch-flattened as-is (no axis permutation) and
    multiplied by its own transpose.
    """
    assert K.ndim(x) == 3
    flat = K.batch_flatten(x)
    return K.dot(flat, K.transpose(flat))
def _step(self, x_tm1, h_tm1, c_tm1, v,
          u_i, u_f, u_o, u_c,
          w_i, w_f, w_c, w_o, w_x,
          v_i, v_f, v_c, v_o,
          b_i, b_f, b_c, b_o, b_x):
    """LSTM decoder step fed by its own previous output and a fixed vector.

    Inputs are the output of the previous time step (``x_tm1``) and the
    vector from the encoder (``v``). Returns ``(x_t, h_t, c_t)`` where
    ``x_t`` is the hidden state projected with ``w_x`` / ``b_x``.
    """
    def _input_term(w, v_w, b):
        # Previous output and encoder vector, each through its own kernel.
        return K.dot(x_tm1, w) + K.dot(v, v_w) + b

    xi_t = _input_term(w_i, v_i, b_i)
    xf_t = _input_term(w_f, v_f, b_f)
    xc_t = _input_term(w_c, v_c, b_c)
    xo_t = _input_term(w_o, v_o, b_o)

    i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
    f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
    candidate = self.activation(xc_t + K.dot(h_tm1, u_c))
    c_t = f_t * c_tm1 + i_t * candidate
    o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))
    h_t = o_t * self.activation(c_t)

    x_t = K.dot(h_t, w_x) + b_x
    return x_t, h_t, c_t
def gram_matrix(x):
    """Return the Gram matrix of ``x`` with its third axis moved to the front.

    Axes are permuted with (2, 0, 1), the result is batch-flattened and
    multiplied by its own transpose.
    """
    reordered = backend.permute_dimensions(x, (2, 0, 1))
    flat = backend.batch_flatten(reordered)
    return backend.dot(flat, backend.transpose(flat))
def call(self, inputs, states, training=None):
    """GRU-style cell step with an extra gate driven by a weight input.

    ``inputs`` is split column-wise: the first ``self.x_dim`` columns are
    the regular input and the next ``self.weight_dim`` columns feed the
    extra gate ``w``. The new state is
    ``(1 - w * z) * h_tm1 + (w * z) * hh``.

    Side effect: ``self.implementation`` is set to 2 on every call (kept
    from the original; the former ``implementation == 1`` branch was an
    empty, unreachable ``pass`` and has been removed).

    Args:
        inputs: step input, sliced as described above.
        states: list whose first entry is the previous hidden state.
        training: forwarded to dropout-mask generation.

    Returns:
        ``(h, [h])``.
    """
    h_tm1 = states[0]  # previous memory

    x = inputs[:, :self.x_dim]
    wei = inputs[:, self.x_dim:self.x_dim + self.weight_dim]

    # Dropout masks are generated once and cached on the cell.
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            K.ones_like(x),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            K.ones_like(h_tm1),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    # Only the fused implementation is supported; force it.
    self.implementation = 2

    if 0. < self.dropout < 1.:
        x *= dp_mask[0]

    # inputs projected by all gate matrices at once
    matrix_x = K.dot(x, self.kernel)
    matrix_w = K.dot(wei, self.kernel_wei)
    if self.use_bias:
        # biases: bias_z_i, bias_r_i, bias_h_i
        matrix_x = K.bias_add(matrix_x, self.input_bias[:self.units * 3])
    x_z = matrix_x[:, :self.units]
    x_r = matrix_x[:, self.units:2 * self.units]
    x_h = matrix_x[:, 2 * self.units:3 * self.units]
    x_w = matrix_w

    if 0. < self.recurrent_dropout < 1.:
        h_tm1 *= rec_dp_mask[0]

    if self.reset_after:
        # hidden state projected by all gate matrices at once
        matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            matrix_inner = K.bias_add(matrix_inner, self.recurrent_bias)
    else:
        # hidden state projected separately for update/reset and new
        matrix_inner = K.dot(h_tm1,
                             self.recurrent_kernel[:, :2 * self.units])

    recurrent_z = matrix_inner[:, :self.units]
    recurrent_r = matrix_inner[:, self.units:2 * self.units]

    z = self.recurrent_activation(x_z + recurrent_z)
    r = self.recurrent_activation(x_r + recurrent_r)

    if self.reset_after:
        recurrent_h = r * matrix_inner[:, 2 * self.units:3 * self.units]
        # NOTE(review): this reuses the candidate-state slice for the extra
        # gate, whereas the other branch uses kernel columns from 3 * units
        # onward — confirm this is intended.
        recurrent_w = matrix_inner[:, 2 * self.units:3 * self.units]
    else:
        recurrent_h = K.dot(
            r * h_tm1,
            self.recurrent_kernel[:, 2 * self.units:3 * self.units])
        recurrent_w = K.dot(h_tm1, self.recurrent_kernel[:, 3 * self.units:])

    w = self.recurrent_activation(x_w + recurrent_w)

    hh = self.activation(x_h + recurrent_h)

    # previous and candidate state mixed by the (weight-scaled) update gate
    h = (1 - w * z) * h_tm1 + (w * z) * hh
    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True
    return h, [h]
def reshape(x, states):
    """Stateless step function: project ``x`` with ``W_h`` and add ``b_h``.

    ``self`` comes from the enclosing scope; ``states`` is ignored and an
    empty state list is returned.
    """
    projected = K.dot(x, self.W_h) + self.b_h
    no_states = []
    return projected, no_states
def dot_product(x, kernel):
    """Dot product of ``x`` and ``kernel`` that works on Theano and TensorFlow.

    On TensorFlow the kernel is given a trailing axis for ``K.dot`` and the
    result's last axis is squeezed away; otherwise ``K.dot`` is used
    directly.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)
    column_kernel = K.expand_dims(kernel)
    return K.squeeze(K.dot(x, column_kernel), axis=-1)
def _mlp(self, input_, weights, bias):
    """Apply a stack of dense layers and return the last pre-activation.

    Each layer computes ``K.dot(act, w) + b``; ``self.activation`` is
    applied between layers but not to the value that is returned.

    Args:
        input_: input to the first layer.
        weights: iterable of kernels, one per layer.
        bias: iterable of biases, paired with ``weights`` by ``zip``.

    Returns:
        The final layer's linear output, or ``input_`` unchanged when no
        layers are given (previously an UnboundLocalError).
    """
    act = input_
    # Start from the input so an empty layer list is a well-defined identity.
    output = input_
    for w, b in zip(weights, bias):
        output = K.dot(act, w) + b
        act = self.activation(output)
    return output
def attention(self, pre_q, pre_v, pre_k, out_seq_len, d_model,
              attn_mask=None, training=None):
    """
    Computes the attention output once the affine transformations of the
    inputs are done. Argument shapes (from the original documentation):

    :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
    :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
    :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
    :param out_seq_len: the length of the output sequence
    :param d_model: dimensionality of the model (by the paper)
    :param attn_mask: forwarded to ``self.mask_attention``
    :param training: Passed by Keras. Should not be defined manually.
      Optional scalar tensor indicating if we're in training
      or inference phase.
    """
    head_dim = d_model // self.num_heads

    # Q and V become (batch_size, num_heads, seq_len, d_model//heads).
    q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
    v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

    if self.compression_window_size is None:
        k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
    else:
        # Memory-compressed attention from "Generating Wikipedia by
        # Summarizing Long Sequences" (https://arxiv.org/pdf/1801.10198.pdf).
        # Keys and values are shortened with a strided 1D convolution, which
        # shrinks Q * K_transposed from roughly seq_len^2 to
        # convoluted_seq_len^2 (e.g. 9x smaller for window = stride = 3).
        if self.use_masking:
            raise NotImplementedError(
                "Masked memory-compressed attention has not "
                "been implemented yet")
        k = K.permute_dimensions(pre_k, [0, 2, 1, 3])

        def _compress(item, kernel, bias):
            # Step 1: fold heads into the batch axis:
            # (batch * num_heads, seq_len, d_model//heads)
            folded = K.reshape(
                item, (-1, K.int_shape(item)[-2], head_dim))
            # Step 2: "compress" with a strided convolution ...
            convolved = K.conv1d(
                folded,
                kernel,
                strides=self.compression_window_size,
                padding='valid', data_format='channels_last')
            # ... and add the bias.
            biased = K.bias_add(convolved, bias, data_format='channels_last')
            # Step 3: back to (batch_size, num_heads, seq_len, d_model//heads)
            restored_shape = K.concatenate([
                K.shape(item)[0], K.shape(item)[1],
                [-1, head_dim]
            ])
            return K.reshape(biased, restored_shape)

        compressed_k = _compress(k, self.k_conv_kernel, self.k_conv_bias)
        compressed_v = _compress(v, self.v_conv_kernel, self.v_conv_bias)
        k, v = compressed_k, compressed_v
        k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])

    # K is now (batch_size, num_heads, d_model//heads, seq_len), ready for
    # the matrix product with Q.
    sqrt_d = K.sqrt(K.cast(d_model, dtype=K.floatx()) // self.num_heads)
    q_shape = K.shape(q)
    k_t_shape = K.shape(k_transposed)
    v_shape = K.shape(v)

    # All tensors are flattened to 3D (batch_size * num_heads, ...) before
    # batch_dot so that it behaves identically on all backends.
    q_3d = K.reshape(q, (-1, q_shape[-2], q_shape[-1]))
    k_t_3d = K.reshape(k_transposed, (-1, k_t_shape[-2], k_t_shape[-1]))

    # Core scaled dot product:
    # (batch_size * num_heads, tar_seq_len, src_seq_len)
    scaled_scores = K.batch_dot(q_3d, k_t_3d) / sqrt_d
    # Mask the attention for the prediction process, then normalise.
    masked_scores = self.mask_attention(scaled_scores, attn_mask)
    attention_weights = self.apply_dropout_if_needed(
        K.softmax(masked_scores), training=training)

    v_3d = K.reshape(v, (-1, v_shape[-2], v_shape[-1]))
    attention_heads = K.reshape(
        K.batch_dot(attention_weights, v_3d),
        (-1, self.num_heads, q_shape[-2], q_shape[-1]))

    # Merge heads: permute to (batch_size, q_seq_length, num_heads,
    # d_model // num_heads), then flatten to (batch_size * seq_length, d_model).
    attention_heads_merged = K.reshape(
        K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
        (-1, d_model))

    # (batch_size, out_seq_len, d_model); generally out_seq_len should be
    # q_seq_len.
    projected = K.dot(attention_heads_merged, self.output_weights)
    return K.reshape(projected, (-1, out_seq_len, d_model))
def call(self, inputs, **kwargs):
    """Multiply unit-length inputs by the layer kernel.

    The inputs are L2-normalised along their last axis and the normalised
    tensor is then dotted with ``self.kernel``.
    """
    # NOTE(review): the result is a true cosine similarity only when the
    # kernel is itself unit-norm -- confirm where the kernel is normalised.
    unit_inputs = K.l2_normalize(inputs, -1)
    return K.dot(unit_inputs, self.kernel)
def call(self, x, mask=None):
    """Combine two stacked inputs via their product and absolute difference.

    Indices 0 and 1 of the last axis of ``x`` are taken as the two operands.
    Their element-wise product is projected with ``self.W_p``, their absolute
    difference with ``self.W_m``, and the sum is passed through ``tanh``.
    No bias term is added. ``mask`` is accepted but not used.
    """
    left = x[:, :, 0]
    right = x[:, :, 1]
    product_term = K.dot(left * right, self.W_p)
    distance_term = K.dot(K.abs(left - right), self.W_m)
    return K.tanh(product_term + distance_term)
def gram_matrix(x):
    """Return the Gram matrix of ``x``.

    Axis 2 of ``x`` is moved to the front, the remaining axes are flattened,
    and the flattened matrix is multiplied by its own transpose.
    """
    # assumes x is a 3D (rows, cols, channels) tensor -- TODO confirm
    leading = k.permute_dimensions(x, (2, 0, 1))
    flat = k.batch_flatten(leading)
    return k.dot(flat, k.transpose(flat))
def _build_gram_matrix(self, x):
    """Return the Gram matrix of ``x``.

    Axis 2 of ``x`` becomes the leading axis, everything after it is
    flattened, and the result is multiplied by its transpose.
    """
    # assumes x is a 3D (rows, cols, channels) tensor -- TODO confirm
    leading = K.permute_dimensions(x, (2, 0, 1))
    flat = K.batch_flatten(leading)
    return K.dot(flat, K.transpose(flat))
def call(self, x, mask=None):
    """Apply the layer: ``activation(x . W [+ b])``.

    The bias ``self.b`` is added only when ``self.bias`` is truthy.
    ``mask`` is accepted but not used.
    """
    projected = K.dot(x, self.W)
    pre_activation = projected + self.b if self.bias else projected
    return self.activation(pre_activation)
def compute_similarity(self, tensor_1, tensor_2):
    """Score ``tensor_1`` against ``tensor_2`` through ``self.weight_matrix``.

    ``tensor_1`` is projected by the weight matrix, multiplied element-wise
    with ``tensor_2`` and summed over the last axis. ``self.bias`` is then
    added and ``self.activation`` applied.
    """
    projected = K.dot(tensor_1, self.weight_matrix)
    bilinear = K.sum(projected * tensor_2, axis=-1)
    return self.activation(bilinear + self.bias)
def a(x, states):
    """Step function: affine map of ``x`` with ``w_a`` and ``b_a``.

    ``w_a`` and ``b_a`` come from the enclosing scope. ``states`` is ignored
    and an empty state list is returned alongside the output.
    """
    projected = K.dot(x, w_a)
    return projected + b_a, []
def call(self, x, mask=None):
    """Return the product of ``x`` and the layer weights ``self.W``.

    ``mask`` is accepted but not used.
    """
    weights = self.W
    return K.dot(x, weights)
def gram_matrix(features):
    """Return ``features`` multiplied by its own transpose."""
    transposed = K.transpose(features)
    return K.dot(features, transposed)
def call(self, x):
    """Return a two-element list, each entry being ``x . self.kernel``.

    The product is computed separately for each entry, so the two outputs
    are distinct tensors.
    """
    return [K.dot(x, self.kernel) for _ in range(2)]
def _step(self, x_tm1, h_tm1, c_tm1, H,
          u_i, u_f, u_o, u_c,
          w_i, w_f, w_c, w_o, w_x, w_a,
          v_i, v_f, v_c, v_o,
          b_i, b_f, b_c, b_o, b_x, b_a):
    """Run one attention-augmented LSTM step.

    Attention energies are computed from ``H`` plus the repeated previous
    cell state. Their softmax weights ``H`` into a context vector, which
    feeds every gate together with ``x_tm1`` and ``h_tm1``.

    Returns:
        Tuple ``(x_t, h_t, c_t)``: the next input projection, hidden state
        and cell state.
    """
    def score(frame, states):
        # Affine scoring applied per timestep by K.rnn; carries no state.
        return K.dot(frame, w_a) + b_a, []

    # Attention energies over the sequence held in H.
    tiled_cell = K.repeat(c_tm1, self.input_length)
    _, energy, _ = K.rnn(score, H + tiled_cell, [], masking=False)
    energy = activations.get('linear')(energy)
    energy = K.permute_dimensions(energy, (2, 0, 1))[0]

    # Normalise the energies and broadcast them over the feature axis of H.
    alpha = K.repeat(K.softmax(energy), self.input_dim)
    alpha = K.permute_dimensions(alpha, (0, 2, 1))
    context = K.sum(H * alpha, axis=1)

    def gate_input(w, v_w, b):
        # Contribution of the previous input and the context to one gate.
        return K.dot(x_tm1, w) + K.dot(context, v_w) + b

    xi_t = gate_input(w_i, v_i, b_i)
    xf_t = gate_input(w_f, v_f, b_f)
    xc_t = gate_input(w_c, v_c, b_c)
    xo_t = gate_input(w_o, v_o, b_o)

    i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
    f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
    candidate = self.activation(xc_t + K.dot(h_tm1, u_c))
    c_t = f_t * c_tm1 + i_t * candidate
    o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))

    h_t = o_t * self.activation(c_t)
    return K.dot(h_t, w_x) + b_x, h_t, c_t
def gram_matrix(x):
    """Return the Gram matrix of ``x`` after printing its rank and shape.

    Axis 3 of ``x`` is moved to position 1, everything after the first axis
    is flattened, and the flattened matrix is multiplied by its transpose.
    """
    # Debug trace kept from the original. It is written as a function call
    # because the Python 2 print statement is a SyntaxError on Python 3.
    print(K.ndim(x), x.shape)
    # NOTE(review): batch_flatten keeps axis 0, so the Gram matrix is taken
    # across the first axis of x -- confirm this is the intended axis.
    features = K.batch_flatten(K.permute_dimensions(x, (0, 3, 1, 2)))
    gram = K.dot(features, K.transpose(features))
    return gram
def step(self, x, states):
    """Compute one step of the tree-structured gated recurrence.

    ``states`` is unpacked into the parent state, the traversal state, a
    per-sample move type and two dropout-mask collections. The step first
    builds the child state ``h_c`` from the input, parent and traversal
    states, then updates the traversal state using ``h_c``.

    Returns:
        Tuple ``(h_c, h_vplus)``: the new child state and the updated
        traversal state.
    """
    (
        h_p, h_v,  # 0:parent, 1:traversal
        x_type,  # 2:treetype(ins/sub,left/right); ints of size (B,). \in {0,1,2,3}
        B_U, B_W) = states  # 3:Udropoutmask, 4:Wdropoutmask

    #### matrix x has all 4 x computations in it
    ## per move
    # One input weight matrix is selected per sample according to x_type.
    this_Wx = self.W_x[x_type]  ## B, I, 4*O
    matrix_x = K.batch_dot(x * B_W[0], this_Wx) + self.b_x
    # Split the fused input projection into its four output_dim-wide slices.
    x_zp = matrix_x[:, :self.output_dim]
    x_rp = matrix_x[:, self.output_dim:2 * self.output_dim]
    x_rv = matrix_x[:, 2 * self.output_dim:3 * self.output_dim]
    x_ih = matrix_x[:, 3 * self.output_dim:]

    #### matrix p has zp, rp; matrix v has zv, rv
    # NOTE(review): every recurrent product below uses B_U[0]; confirm that
    # sharing one dropout mask across all of them is intended.
    matrix_p = K.dot(h_p * B_U[0], self.U_p[:, :2 * self.output_dim])
    # zp is for the parent unit update (resulting in child unit)
    inner_zp = matrix_p[:, :self.output_dim]
    z_p = self.inner_activation(x_zp + inner_zp)
    # rp is for gating to the intermediate unit of parent
    inner_rp = matrix_p[:, self.output_dim:2 * self.output_dim]
    r_p = self.inner_activation(x_rp + inner_rp)

    matrix_v = K.dot(h_v * B_U[0], self.U_v[:, :2 * self.output_dim])
    # rv is for the intermediate gate on the traversal unit
    # this gets reused for both the parent's and its own intermediate
    inner_rv = matrix_v[:, self.output_dim:2 * self.output_dim]
    r_v = self.inner_activation(x_rv + inner_rv)

    # the actual recurrence calculations
    # h_p * U and h_v * U ; as gated by their r gates
    inner_hp = K.dot(r_p * h_p * B_U[0], self.U_p[:, 2 * self.output_dim:])
    inner_hv = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:])
    # h_c_tilde is the intermediate state
    h_c_tilde = self.activation(x_ih + inner_hp + inner_hv)
    # h_c is the new child state
    h_c = z_p * h_c_tilde + (1 - z_p) * h_p

    # Project the new child state; it drives the traversal-state update below.
    matrix_c = K.dot(h_c * B_U[0], self.U_c) + self.b_c
    hc_zv = matrix_c[:, :self.output_dim]
    hc_rv = matrix_c[:, self.output_dim:2 * self.output_dim]
    hc_ih = matrix_c[:, 2 * self.output_dim:]

    ### zv -> gate h_v and h_v_tilde
    ### rv -> gate h_v's contribution to h_v_tilde
    ### ih -> h_c's contribution to h_v_tilde

    # zv is for the traversal unit update.
    inner_zv = matrix_v[:, :self.output_dim]
    z_v = self.inner_activation(hc_zv + inner_zv)
    ## r_v is calculated with h_c rather than x
    # r_v is rebound here; the earlier x-based value was only used for inner_hv.
    r_v = self.inner_activation(hc_rv + inner_rv)
    inner_hvplus = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:])
    h_vplus_tilde = self.activation(hc_ih + inner_hvplus)
    # The gate polarity differs from h_c: here z_v keeps the old state h_v.
    h_vplus = z_v * h_v + (1 - z_v) * h_vplus_tilde

    return h_c, h_vplus
def train_model(base_model: keras.Model, is_causal: bool, tasks_meta_data: List[TaskMetadata], pretrain_generator,
                finetune_generator, pretrain_epochs: int = 1, pretrain_optimizer='adam',
                pretrain_steps: int = 1000000, pretrain_callbacks=None, finetune_epochs: int = 1,
                finetune_optimizer='adam', finetune_steps: int = 10000, finetune_callbacks=None,
                verbose: int = 0, TPUStrategy=None):
    """Attach per-task loss heads to ``base_model``, train, and return a logits model.

    A logits head and a weighted loss node are built for every task in
    ``tasks_meta_data``. If ``pretrain_generator`` is not None a pretraining
    phase is run, and likewise a finetuning phase for ``finetune_generator``.
    Each phase fits a temporary model whose outputs are the loss nodes of the
    tasks active in that phase.

    Args:
        base_model: Model whose first three inputs are used as token, segment
            and position inputs. A fourth input, when present, is used as the
            attention mask.
        is_causal: Forwarded to ``create_attention_mask``.
        tasks_meta_data: Task descriptors; task names must be unique.
        pretrain_generator: Batch source for pretraining, or None to skip it.
        finetune_generator: Batch source for finetuning, or None to skip it.
        pretrain_epochs, pretrain_optimizer, pretrain_steps, pretrain_callbacks:
            Passed to ``compile`` / ``fit_generator`` for the pretraining phase.
        finetune_epochs, finetune_optimizer, finetune_steps, finetune_callbacks:
            Same, for the finetuning phase.
        verbose: Verbosity passed to ``fit_generator``.
        TPUStrategy: When not None, models are converted with
            ``tf.contrib.tpu.keras_to_tpu_model`` using this strategy.

    Returns:
        A model taking the base inputs plus the sentence-level mask inputs and
        producing the logits of every task.
    """
    token_input = base_model.inputs[0]
    segment_input = base_model.inputs[1]
    position_input = base_model.inputs[2]
    uses_attn_mask = len(base_model.inputs) == 4
    max_len = K.int_shape(base_model.inputs[0])[1]
    if uses_attn_mask:
        attention_mask_input = base_model.inputs[3]
    all_logits = []
    all_tasks = {task.name: task for task in tasks_meta_data}
    task_nodes = {}
    sent_level_mask_inputs = []
    # Duplicate task names would collapse in the dict above.
    assert len(all_tasks) == len(tasks_meta_data)
    for task in all_tasks.values():
        # Per-sample scalar fed at train time to scale this task's loss.
        task_loss_weight = Input(batch_shape=(None, 1), dtype='float32', name=task.name + '_loss_weight')
        if task.is_token_level:
            if task.name == 'lm':
                # The 'lm' task decodes with the transposed token embedding weights.
                decoder = Lambda(lambda x: K.dot(x, K.transpose(base_model.get_layer('TokenEmbedding').weights[0])),
                                 name='lm_logits')
            else:
                decoder = Dense(units=task.num_classes, name=task.name + '_logits')
            logits = TimeDistributed(decoder, name=task.name + '_logits_time_distributed')(
                Dropout(task.dropout)(base_model.outputs[0]))
            task_target = Input(batch_shape=(None, max_len, ), dtype='int32', name=task.name + '_target_input')
            task_mask = Input(batch_shape=(None, max_len), dtype='int8' if TPUStrategy is None else 'int32',
                              name=task.name + '_mask_input')
            task_loss = Lambda(lambda x: x[0] * masked_classification_loss(x[1], x[2], x[3]),
                               name=task.name + '_loss')([task_loss_weight, task_target, logits, task_mask])
        else:
            # Sentence-level task: the mask input is passed to sparse_gather to
            # select the decoder input from the base model output.
            task_mask = Input(batch_shape=(None, 1), dtype='int32', name=task.name + '_mask_input')
            decoder_input = sparse_gather(base_model.outputs[0], task_mask, task.name)
            logits = Dense(units=task.num_classes, name=task.name + '_logits')(
                Dropout(task.dropout)(decoder_input))
            task_target = Input(batch_shape=(None, 1), dtype='int32', name=task.name + '_target_input')
            task_loss = Lambda(lambda x: x[0] * classification_loss(x[1], x[2]), name=task.name + '_loss')(
                [task_loss_weight, task_target, logits])
            # The returned model needs these inputs because its logits depend on them.
            sent_level_mask_inputs.append(task_mask)
        task_nodes[task.name] = {
            'target': task_target,
            'mask': task_mask,
            'loss_weight': task_loss_weight,
            'loss': task_loss,
        }
        all_logits.append(logits)

    def get_generator(sentence_generator: Generator[SentenceBatch, None, None], is_pretrain: bool):
        # Yields (x, y) pairs whose x ordering mirrors the _inputs list that
        # train_step builds: base inputs, then target/mask/loss_weight per active task.
        for i, batch in enumerate(sentence_generator):
            batch_size, seq_len = batch.tokens.shape
            x = [batch.tokens, batch.segments, generate_pos_ids(batch_size, max_len)]
            y = []
            if uses_attn_mask:
                x.append(create_attention_mask(batch.padding_mask, is_causal))
            for task_name in task_nodes.keys():
                if is_pretrain:
                    cond = all_tasks[task_name].weight_scheduler.active_in_pretrain
                else:
                    cond = all_tasks[task_name].weight_scheduler.active_in_finetune
                if cond:
                    if task_name in batch.sentence_classification:
                        task_data_batch = batch.sentence_classification[task_name]
                    else:
                        task_data_batch = batch.token_classification[task_name]
                    x.append(task_data_batch.target)
                    if all_tasks[task_name].is_token_level:
                        x.append(task_data_batch.target_mask)
                    else:
                        # Offset each sample's mask value by its row start
                        # (row index * seq_len).
                        # NOTE(review): presumably a flat index for sparse_gather -- confirm.
                        x.append(
                            (task_data_batch.target_mask + np.arange(batch_size) * seq_len).astype(np.int32))
                    # Loss weight for step i, repeated for every sample in the batch.
                    x.append(
                        np.repeat(np.array([all_tasks[task_name].weight_scheduler.get(is_pretrain, i)]),
                                  batch_size, 0))
                    # Dummy zero target; the model is compiled with pass_through_loss.
                    y.append(np.repeat(np.array([0.0]), batch_size, 0))
            yield x, y

    def train_step(is_pretrain: bool):
        # Builds and fits a temporary model exposing the losses of the tasks
        # active in the requested phase.
        _inputs = [token_input, segment_input, position_input]
        _outputs = []
        if uses_attn_mask:
            _inputs.append(attention_mask_input)
        for task_name in task_nodes.keys():
            if is_pretrain:
                cond = all_tasks[task_name].weight_scheduler.active_in_pretrain
            else:
                cond = all_tasks[task_name].weight_scheduler.active_in_finetune
            if cond:
                _inputs.append(task_nodes[task_name]['target'])
                _inputs.append(task_nodes[task_name]['mask'])
                _inputs.append(task_nodes[task_name]['loss_weight'])
                _outputs.append(task_nodes[task_name]['loss'])
        _generator = get_generator(pretrain_generator if is_pretrain else finetune_generator, is_pretrain)
        _model = keras.Model(inputs=_inputs, outputs=_outputs)
        if TPUStrategy is not None:
            ''' Create TPUStrategy like this: tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR'] TPUStrategy = tf.contrib.tpu.TPUDistributionStrategy( tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address) ) '''
            _model = tf.contrib.tpu.keras_to_tpu_model(_model, strategy=TPUStrategy)
        _model.compile(pretrain_optimizer if is_pretrain else finetune_optimizer, loss=pass_through_loss)
        _model.fit_generator(_generator, steps_per_epoch=pretrain_steps if is_pretrain else finetune_steps,
                             verbose=verbose, callbacks=pretrain_callbacks if is_pretrain else finetune_callbacks,
                             shuffle=False, epochs=pretrain_epochs if is_pretrain else finetune_epochs)

    if pretrain_generator is not None:
        train_step(True)
    if finetune_generator is not None:
        train_step(False)
    ret_model = keras.Model(inputs=base_model.inputs + sent_level_mask_inputs, outputs=all_logits)
    if TPUStrategy is not None:
        ret_model = tf.contrib.tpu.keras_to_tpu_model(ret_model, strategy=TPUStrategy)
        # Compile for TPU model predicting for the first time. Also you can have a new compile for training use after this
        ret_model.compile(finetune_optimizer, loss=pass_through_loss)
    return ret_model
def get_Gram_matrix(F):
    """Return the Gram matrix ``F . F^T`` of the matrix ``F``."""
    F_transposed = K.transpose(F)
    return K.dot(F, F_transposed)
def dot_product(x, kernel):
    """Dot ``x`` with a vector ``kernel`` and drop the trailing unit axis.

    ``kernel`` gets an extra trailing axis so that ``K.dot`` can be applied,
    and the resulting last axis is squeezed away again.
    """
    column = K.expand_dims(kernel)
    product = K.dot(x, column)
    return K.squeeze(product, axis=-1)
def step(self, x, states):
    """Run one decoding step of the attention-based gated recurrent cell.

    ``states`` holds the previous output and previous hidden state. Attention
    weights over the sequence are derived from the previous hidden state and
    used to build a context vector, which feeds the reset gate, update gate,
    candidate state and output distribution. ``x`` is not read here.

    Returns:
        ``(at, [yt, st])`` when ``self.return_probabilities`` is truthy,
        otherwise ``(yt, [yt, st])``.
    """
    prev_output, prev_state = states

    # Attention: tile the hidden state over the timesteps and score each one.
    tiled_state = K.repeat(prev_state, self.timesteps)
    projected_state = K.dot(tiled_state, self.W_a)
    scores = K.dot(activations.tanh(projected_state + self._uxpb),
                   K.expand_dims(self.V_a))

    # Explicit softmax over the timestep axis.
    weights = K.exp(scores)
    normaliser = K.repeat(K.sum(weights, axis=1), self.timesteps)
    weights = weights / normaliser

    # Context vector: attention-weighted sum of the stored sequence.
    context = K.squeeze(K.batch_dot(weights, self.x_seq, axes=1), axis=1)

    def affine(state_term, w, u, c, b):
        # Shared form of all four projections below.
        return (K.dot(prev_output, w) + K.dot(state_term, u)
                + K.dot(context, c) + b)

    reset_gate = activations.sigmoid(
        affine(prev_state, self.W_r, self.U_r, self.C_r, self.b_r))
    update_gate = activations.sigmoid(
        affine(prev_state, self.W_z, self.U_z, self.C_z, self.b_z))
    candidate = activations.tanh(
        affine(reset_gate * prev_state, self.W_p, self.U_p, self.C_p, self.b_p))

    new_state = (1 - update_gate) * prev_state + update_gate * candidate

    new_output = activations.softmax(
        affine(prev_state, self.W_o, self.U_o, self.C_o, self.b_o))

    emitted = weights if self.return_probabilities else new_output
    return emitted, [new_output, new_state]
def gram_matrix(x):
    """Return the Gram matrix of ``x``.

    Axis 2 is moved to the front, the remaining axes are flattened, and the
    flattened matrix is multiplied by its transpose.
    """
    # assumes x is a 3D (rows, cols, channels) tensor -- TODO confirm
    leading = K.permute_dimensions(x, (2, 0, 1))
    flat = K.batch_flatten(leading)
    return K.dot(flat, K.transpose(flat))
def get_constants(self, enc_output, constants):
    """Append the projected and raw encoder output to ``constants``.

    ``constants`` is mutated in place: first ``enc_output . self.W1`` is
    appended, then ``enc_output`` itself. The same object is returned.
    """
    projected = K.dot(enc_output, self.W1)
    for item in (projected, enc_output):
        constants.append(item)
    return constants
def loss2(self, y_true, y_pred):
    """Return half the quadratic form of the residual under a fixed covariance.

    Computes ``0.5 * r . inv(sigma) . r^T`` with ``r = y_true - y_pred`` and
    ``sigma`` a diagonal matrix of size ``2 * self.F`` filled with 0.005.
    """
    dim = 2 * self.F
    sigma = K.cast_to_floatx(np.diag(np.full((dim, ), 0.005)))
    precision = tf.matrix_inverse(sigma)
    weighted = K.dot((y_true - y_pred), precision)
    return (0.5) * K.dot(weighted, tf.transpose(y_true - y_pred))