def forward(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

    # Attention scores; 1D: batch, 2D: n_agents
    u = T.dot(M, self.w)

    # Attention weights; 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(u)
    alpha = alpha.dimshuffle((0, 1, 'x'))

    # Weighted sum of agent states; 1D: batch, 2D: dim_h
    r = T.sum(A * alpha, axis=1)

    # 1D: batch, 2D: dim_h
    h = relu(T.dot(r, self.W2_r))
    return h
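# For reference, the same additive-attention step in plain NumPy. The sizes
# (batch=2, n_agents=3, dim_h=4) and the randomly drawn weights are purely
# illustrative stand-ins for the layer's parameters, not values from the model.
import numpy as np

def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, n_agents, dim_h = 2, 3, 4
A = np.random.randn(batch, n_agents, dim_h)
a_res = np.random.randn(batch, dim_h)
W1_c, W1_h, W2_r = (np.random.randn(dim_h, dim_h) for _ in range(3))
w = np.random.randn(dim_h)

M = np.tanh(A @ W1_c + (a_res @ W1_h)[:, None, :])  # batch x n_agents x dim_h
alpha = _softmax(M @ w, axis=1)                      # batch x n_agents
r = (A * alpha[:, :, None]).sum(axis=1)              # batch x dim_h
h = np.maximum(0.0, r @ W2_r)                        # batch x dim_h
assert h.shape == (batch, dim_h)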
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid", "relu" or "tanh"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    elif activation == "tanh":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = tanh(Z)
    else:
        raise ValueError("Unsupported activation: " + str(activation))

    assert A.shape == (W.shape[0], A_prev.shape[1])
    cache = (linear_cache, activation_cache)

    return A, cache
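# Usage sketch: chaining linear_activation_forward into a 2-layer forward pass
# (input 3 -> hidden 5 -> output 1). It assumes linear_forward, relu and sigmoid
# return (output, cache) pairs as the docstring describes; the parameter dict
# layout below is hypothetical.
import numpy as np

parameters = {
    "W1": np.random.randn(5, 3) * 0.01, "b1": np.zeros((5, 1)),
    "W2": np.random.randn(1, 5) * 0.01, "b2": np.zeros((1, 1)),
}
X = np.random.randn(3, 10)  # 10 examples, stored column-wise as in the docstring

A1, cache1 = linear_activation_forward(X, parameters["W1"], parameters["b1"], "relu")
A2, cache2 = linear_activation_forward(A1, parameters["W2"], parameters["b2"], "sigmoid")
caches = [cache1, cache2]  # kept around for the backward pass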
def forward_second_order(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

    # First attention head; 1D: batch, 2D: n_agents
    M_a = T.dot(M, self.w)

    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(M_a)
    alpha = alpha.dimshuffle((0, 1, 'x'))

    # 1D: batch, 2D: dim_h
    r_a = T.sum(A * alpha, axis=1)

    # Broadcast w; 1D: 1, 2D: n_agents, 3D: dim_h
    w = self.w.dimshuffle(('x', 'x', 0))
    w = T.repeat(w, M_a.shape[1], axis=1)

    # Broadcast the first head's scores; 1D: batch, 2D: n_agents, 3D: dim_h
    M_a = M_a.dimshuffle((0, 1, 'x'))
    M_a = T.repeat(M_a, M.shape[2], axis=2)

    # Subtract each position's component along w (scaled by 1/||w||), so the
    # second scoring vector w_b attends to directions the first head ignored
    M_b = M - (M_a * w) / self.w.norm(2)

    # Second attention head
    beta = T.nnet.softmax(T.dot(M_b, self.w_b))
    beta = beta.dimshuffle((0, 1, 'x'))

    # 1D: batch, 2D: dim_h
    r_b = T.sum(A * beta, axis=1)

    # 1D: batch, 2D: dim_h
    h = relu(T.dot(T.concatenate([r_a, r_b], axis=1), self.W2_r))
    return h
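# The M_b step above is a projection removal: each position of M has its
# component along w subtracted before the second scoring vector w_b is applied.
# A minimal NumPy illustration with an arbitrary vector; note that the exact
# projection divides by the squared norm, which matches the code's division by
# self.w.norm(2) only when w is unit-length.
import numpy as np

w = np.array([1.0, 0.0, 0.0])
m = np.array([3.0, 2.0, -1.0])
m_b = m - (m @ w) * w / (w @ w)  # remove m's component along w
assert np.isclose(m_b @ w, 0.0)  # m_b is orthogonal to w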
def forward_double(self, A, a_res):
    """
    :param A: 1D: batch, 2D: n_agents, 3D: dim_h
    :param a_res: 1D: batch, 2D: dim_h
    :return: h: 1D: batch, 2D: dim_h
    """
    # 1D: batch, 2D: n_agents, 3D: dim_h
    M = tanh(T.dot(A, self.W1_c) + T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

    # Scores over agents; 1D: batch, 2D: n_agents
    M_a = T.dot(M, self.w)

    # Scores over hidden dimensions (max-pooled over agents); 1D: batch, 2D: dim_h
    M_b = T.max(M, axis=1)
    M_b = T.dot(M_b, self.W_m)

    # 1D: batch, 2D: n_agents, 3D: 1
    alpha = T.nnet.softmax(M_a)
    alpha = alpha.dimshuffle((0, 1, 'x'))

    # 1D: batch, 2D: 1, 3D: dim_h
    beta = T.nnet.softmax(M_b)
    beta = beta.dimshuffle((0, 'x', 1))

    # Joint weight per (agent, dimension) cell; 1D: batch, 2D: n_agents, 3D: dim_h
    # gamma = - (T.log(alpha) + T.log(beta))
    gamma = alpha * beta

    # 1D: batch, 2D: dim_h
    r = T.sum(A * gamma, axis=1)

    # 1D: batch, 2D: dim_h
    h = relu(T.dot(r, self.W2_r))
    return h
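# Because alpha is normalized over agents and beta over hidden dimensions, their
# broadcast product gamma assigns each (agent, dimension) cell its own weight:
# it sums to 1 jointly over both axes, but is no longer a distribution over
# either axis alone. A quick NumPy check with illustrative shapes:
import numpy as np

def _softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, n_agents, dim_h = 2, 3, 4
alpha = _softmax(np.random.randn(batch, n_agents), axis=1)[:, :, None]
beta = _softmax(np.random.randn(batch, dim_h), axis=1)[:, None, :]
gamma = alpha * beta                      # batch x n_agents x dim_h
assert np.allclose(gamma.sum(axis=(1, 2)), 1.0)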
def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y,
             init_emb, n_vocab, dim_w_p, dim_d, dim_h, L2_reg):
    """
    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch, 2D: 2; elem=[sent dist, ment dist]
    :param x_slen: 1D: batch, 2D: 3; elem=[m_span_len, a_span_len, head_match]
    :param y     : 1D: batch
    """
    self.input = [x_span, x_word, x_ctx, x_dist, x_slen, y]
    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.x_slen = x_slen
    self.y = y

    """ Dimensions """
    dim_w_a = dim_w_p // 5
    dim_x_a = dim_w_a * (5 + 2 + 2 + 1)
    dim_x_p = dim_w_p * (10 + 4 + 4 + 2 + 3) + dim_x_a
    batch = y.shape[0]

    """ Hyper Parameters for Cost Function """
    self.a1 = 0.5
    self.a2 = 1.2
    self.a3 = 1.0

    """ Params """
    if init_emb is None:
        self.W_a_w = theano.shared(sample_weights(n_vocab, dim_w_a))
        self.W_p_w = theano.shared(sample_weights(n_vocab, dim_w_p))
    else:
        self.W_a_w = theano.shared(init_emb)
        self.W_p_w = theano.shared(init_emb)
    self.W_a_l = theano.shared(sample_weights(5, dim_w_a))
    self.W_a_o = theano.shared(sample_weights(dim_x_a, 1))
    self.W_p_d = theano.shared(sample_weights(dim_d, dim_w_p))
    self.W_p_l = theano.shared(sample_weights(7, dim_w_p))
    self.W_p_h = theano.shared(sample_weights(dim_x_p, dim_h))
    self.W_p_o = theano.shared(sample_weights(dim_h))
    self.params = [self.W_p_d, self.W_p_l, self.W_a_l,
                   self.W_p_h, self.W_p_o, self.W_a_o]

    """ Anaphoric Layer """
    x_vec_a = T.concatenate([x_span[0][: x_span.shape[1] // 2],
                             x_word[0][: x_word.shape[1] // 2],
                             x_ctx[0][: x_ctx.shape[1] // 2]])
    x_a_w = self.W_a_w[x_vec_a]       # 1D: limit + 2 + ctx, 2D: dim_w_a
    x_a_l = self.W_a_l[x_slen[0][0]]  # 1D: dim_w_a
    h_a = T.concatenate([x_a_w.flatten(), x_a_l])

    """ Pair Layer """
    x_p_w_in = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
    x_p_w = self.W_p_w[x_p_w_in]  # 1D: batch * (limit * 2 + 4 + ctx * 2), 2D: dim_w_p
    x_p_l = self.W_p_l[x_slen]    # 1D: batch, 2D: 3, 3D: dim_w_p
    x_p_d = self.W_p_d[x_dist]    # 1D: batch, 2D: 2, 3D: dim_w_p
    h_p = T.concatenate([x_p_w.reshape((batch, -1)),
                         x_p_d.reshape((batch, -1)),
                         x_p_l.reshape((batch, -1))], 1)
    g_p = tanh(T.dot(T.concatenate([h_p, T.repeat(h_a.dimshuffle('x', 0), batch, 0)], 1),
                     self.W_p_h))

    """ Output Layer """
    p_y_a = T.dot(h_a, self.W_a_o)  # p_y_a: 1D: 1; elem=scalar
    p_y_p = T.dot(g_p, self.W_p_o)  # p_y_p: 1D: batch
    p_y = T.concatenate([p_y_a, p_y_p])

    """ Label Set """
    y_0 = T.switch(T.sum(y), 0, 1)  # y_0: 1 if the mention is non-anaphoric else 0
    y_all = T.concatenate([y_0.dimshuffle('x'), y])

    """ Predicts """
    self.y_hat = T.argmax(p_y)
    self.p_y_hat = p_y[T.argmax(p_y - T.min(p_y) * y_all)]

    """ Cost Function """
    self.nll = T.max(self.miss_cost(T.arange(y_all.shape[0]), y_all) * (1 + p_y - self.p_y_hat))
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    """ Optimization """
    self.updates = sgd_w(self.cost, self.params, self.W_p_w, x_p_w, self.W_a_w, x_a_w)

    """ Check Results """
    self.total_p = T.switch(self.y_hat, 1, 0)
    self.total_r = 1 - y_0
    self.correct = y_all[self.y_hat]
    self.correct_t = T.switch(self.correct, T.switch(y_0, 0, 1), 0)
    self.correct_f = T.switch(self.correct, T.switch(y_0, 1, 0), 0)
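# A sketch of how this graph might be wired up and compiled. The class name
# Model and the concrete sizes are hypothetical; the symbolic input types
# follow the docstring shapes (integer id matrices plus an integer label vector).
import theano
import theano.tensor as T

x_span = T.imatrix('x_span')
x_word = T.imatrix('x_word')
x_ctx = T.imatrix('x_ctx')
x_dist = T.imatrix('x_dist')
x_slen = T.imatrix('x_slen')
y = T.ivector('y')

model = Model(x_span, x_word, x_ctx, x_dist, x_slen, y,
              init_emb=None, n_vocab=10000, dim_w_p=50,
              dim_d=50, dim_h=100, L2_reg=1e-4)

train = theano.function(inputs=model.input,
                        outputs=[model.cost, model.correct],
                        updates=model.updates)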