Example #1
    def forward(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        u = T.dot(M, self.w)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(u)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r = T.sum(A * alpha, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(r, self.W2_r))
        return h
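This forward pass is one additive-attention step over the agent states A, conditioned on a_res: a scored combination of the agents followed by a ReLU projection. Below is a plain NumPy sketch of the same computation; the dimensions, random data, and the local softmax helper are assumptions made for illustration, not part of the original model.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, n_agents, dim_h = 2, 4, 8                     # assumed sizes
rng = np.random.RandomState(0)
A = rng.randn(batch, n_agents, dim_h)                # agent states
a_res = rng.randn(batch, dim_h)                      # responder state
W1_c = rng.randn(dim_h, dim_h)
W1_h = rng.randn(dim_h, dim_h)
w = rng.randn(dim_h)
W2_r = rng.randn(dim_h, dim_h)

M = np.tanh(A @ W1_c + (a_res @ W1_h)[:, None, :])   # batch x n_agents x dim_h
alpha = softmax(M @ w, axis=1)[:, :, None]           # batch x n_agents x 1
r = (A * alpha).sum(axis=1)                          # batch x dim_h
h = np.maximum(r @ W2_r, 0.0)                        # ReLU
print(h.shape)                                       # (2, 8)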
Example #2
def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid", "relu", or "tanh"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """

    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "Z, linear_cache" and "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)

    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "Z, linear_cache" and "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)

    elif activation == "tanh":
        # Inputs: "A_prev, W, b". Outputs: "Z, linear_cache" and "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = tanh(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache
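linear_activation_forward depends on linear_forward and on activation helpers that each return (output, cache). Those helpers are not shown in this listing, so here is a minimal NumPy sketch of plausible implementations (only the return convention is taken from the docstring above; everything else is an assumption), followed by a small usage example.

import numpy as np

def linear_forward(A_prev, W, b):
    Z = W @ A_prev + b                  # pre-activation
    return Z, (A_prev, W, b)            # linear cache for the backward pass

def sigmoid(Z):
    return 1.0 / (1.0 + np.exp(-Z)), Z

def relu(Z):
    return np.maximum(0, Z), Z

def tanh(Z):
    return np.tanh(Z), Z

# Usage: one layer with 3 inputs, 2 units, 5 examples.
rng = np.random.RandomState(1)
A_prev = rng.randn(3, 5)
W = rng.randn(2, 3)
b = np.zeros((2, 1))
A, cache = linear_activation_forward(A_prev, W, b, activation="relu")
print(A.shape)                          # (2, 5)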
Example #3
    def forward_second_order(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        M_a = T.dot(M, self.w)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(M_a)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r_a = T.sum(A * alpha, axis=1)

        # 1D: 1, 2D: n_agents, 3D: dim_h
        w = self.w.dimshuffle(('x', 'x', 0))
        w = T.repeat(w, M_a.shape[1], axis=1)

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M_a = M_a.dimshuffle((0, 1, 'x'))
        M_a = T.repeat(M_a, M.shape[2], axis=2)
        M_b = M - T.sum(M_a * w) / self.w.norm(2)

        beta = T.nnet.softmax(T.dot(M_b, self.w_b))
        beta = beta.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: dim_h
        r_b = T.sum(A * beta, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(T.concatenate([r_a, r_b], axis=1), self.W2_r))
        return h
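The second scoring step builds M_b by subtracting from M a projection term along the direction w before applying the second softmax, i.e. a Gram-Schmidt-style removal of the w-component. The NumPy sketch below shows the per-position form of that idea under assumed shapes; the listing above sums the projection over all positions and normalizes by the L2 norm, so treat this only as an illustration of the intent, not a reimplementation.

import numpy as np

rng = np.random.RandomState(2)
batch, n_agents, dim_h = 2, 4, 8           # assumed sizes
M = rng.randn(batch, n_agents, dim_h)      # stands in for the tanh-transformed M
w = rng.randn(dim_h)                       # first scoring direction

# Remove, at every position, the component of M that lies along w.
proj = (M @ w)[:, :, None] * w / np.dot(w, w)
M_b = M - proj
print(np.abs(M_b @ w).max())               # ~0: no residual w-component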
Example #4
    def forward_double(self, A, a_res):
        """
        :param A: 1D: batch, 2D: n_agents, 3D: dim_h
        :param a_res: 1D: batch, 2D: dim_h
        :return: h: 1D: batch, 2D: dim_h
        """

        # 1D: batch, 2D: n_agents, 3D: dim_h
        M = tanh(
            T.dot(A, self.W1_c) +
            T.dot(a_res, self.W1_h).dimshuffle(0, 'x', 1))

        # 1D: batch, 2D: n_agents
        M_a = T.dot(M, self.w)

        # 1D: batch, 2D: dim_h
        M_b = T.max(M, axis=1)
        M_b = T.dot(M_b, self.W_m)

        # 1D: batch, 2D: n_agents, 3D: 1
        alpha = T.nnet.softmax(M_a)
        alpha = alpha.dimshuffle((0, 1, 'x'))

        # 1D: batch, 2D: 1, 3D: dim_h
        beta = T.nnet.softmax(M_b)
        beta = beta.dimshuffle((0, 'x', 1))

        # 1D: batch, 2D: n_agents, 3D: dim_h
        # alternative (unused): gamma = -(T.log(alpha) + T.log(beta))
        gamma = alpha * beta

        # 1D: batch, 2D: dim_h
        r = T.sum(A * gamma, axis=1)

        # 1D: batch, 2D: dim_h
        h = relu(T.dot(r, self.W2_r))
        return h
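forward_double combines a per-agent weighting alpha (batch x n_agents x 1) with a per-dimension weighting beta (batch x 1 x dim_h) by broadcasting, so gamma assigns a weight to every (agent, dimension) cell before the pooled sum. Below is a small NumPy sketch of just that broadcasting step, with made-up scores standing in for M_a and the W_m-projected M_b.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, n_agents, dim_h = 2, 3, 4           # assumed sizes
rng = np.random.RandomState(3)
A = rng.randn(batch, n_agents, dim_h)
scores_a = rng.randn(batch, n_agents)      # stands in for M_a
scores_b = rng.randn(batch, dim_h)         # stands in for the projected M_b

alpha = softmax(scores_a, axis=1)[:, :, None]   # batch x n_agents x 1
beta = softmax(scores_b, axis=1)[:, None, :]    # batch x 1 x dim_h
gamma = alpha * beta                            # batch x n_agents x dim_h
r = (A * gamma).sum(axis=1)                     # batch x dim_h
print(r.shape)                                  # (2, 4)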
Example #5
    def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y, init_emb, n_vocab, dim_w_p, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; 2D: 2; elem=[sent dist, ment dist]
        :param x_slen: 1D: batch; 2D: 3; elem=[m_span_len, a_span_len, head_match]
        :param y     : 1D: batch
        """

        self.input = [x_span, x_word, x_ctx, x_dist, x_slen, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx = x_ctx
        self.x_dist = x_dist
        self.x_slen = x_slen
        self.y = y

        """ Dimensions """
        dim_w_a = dim_w_p // 5
        dim_x_a = dim_w_a * (5 + 2 + 2 + 1)
        dim_x_p = dim_w_p * (10 + 4 + 4 + 2 + 3) + dim_x_a
        batch = y.shape[0]

        """ Hyper Parameters for Cost Function """
        self.a1 = 0.5
        self.a2 = 1.2
        self.a3 = 1.0

        """ Params """
        if init_emb is None:
            self.W_a_w = theano.shared(sample_weights(n_vocab, dim_w_a))
            self.W_p_w = theano.shared(sample_weights(n_vocab, dim_w_p))
        else:
            self.W_a_w = theano.shared(init_emb)
            self.W_p_w = theano.shared(init_emb)

        self.W_a_l = theano.shared(sample_weights(5, dim_w_a))
        self.W_a_o = theano.shared(sample_weights(dim_x_a, 1))

        self.W_p_d = theano.shared(sample_weights(dim_d, dim_w_p))
        self.W_p_l = theano.shared(sample_weights(7, dim_w_p))
        self.W_p_h = theano.shared(sample_weights(dim_x_p, dim_h))
        self.W_p_o = theano.shared(sample_weights(dim_h))

        self.params = [self.W_p_d, self.W_p_l, self.W_a_l, self.W_p_h, self.W_p_o, self.W_a_o]

        """ Anaphoric Layer """
        x_vec_a = T.concatenate(
            [x_span[0][: x_span.shape[1] // 2], x_word[0][: x_word.shape[1] // 2], x_ctx[0][: x_ctx.shape[1] // 2]]
        )

        x_a_w = self.W_a_w[x_vec_a]  # 1D: batch, 2D: (limit * 1 + 2 + ctx), 3D: dim_w_a
        x_a_l = self.W_a_l[x_slen[0][0]]  # 1D: dim_w_a
        h_a = T.concatenate([x_a_w.flatten(), x_a_l])

        """ Pair Layer """
        x_p_w_in = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
        x_p_w = self.W_p_w[x_p_w_in]  # 1D: batch, 2D: (limit * 2 + 4 + ctx * 2), 3D: dim_w
        x_p_l = self.W_p_l[x_slen]  # 1D: batch, 2D: 3, 3D: dim_w
        x_p_d = self.W_p_d[x_dist]  # 1D: batch, 2D: 2, 3D: dim_w
        h_p = T.concatenate([x_p_w.reshape((batch, -1)), x_p_d.reshape((batch, -1)), x_p_l.reshape((batch, -1))], 1)
        g_p = tanh(T.dot(T.concatenate([h_p, T.repeat(h_a.dimshuffle("x", 0), batch, 0)], 1), self.W_p_h))

        """ Output Layer """
        p_y_a = T.dot(h_a, self.W_a_o)  # p_y_a: 1D: 1; elem=scalar
        p_y_p = T.dot(g_p, self.W_p_o)  # p_y_p: 1D: batch
        p_y = T.concatenate([p_y_a, p_y_p])

        """ Label Set """
        y_0 = T.switch(T.sum(y), 0, 1)  # y_0: 1 if the mention is non-anaph else 0
        y_all = T.concatenate([y_0.dimshuffle("x"), y])

        """ Predicts """
        self.y_hat = T.argmax(p_y)
        self.p_y_hat = p_y[T.argmax(p_y - T.min(p_y) * y_all)]

        """ Cost Function """
        self.nll = T.max(self.miss_cost(T.arange(y_all.shape[0]), y_all) * (1 + p_y - self.p_y_hat))
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

        """ Optimization """
        self.updates = sgd_w(self.cost, self.params, self.W_p_w, x_p_w, self.W_a_w, x_a_w)

        """ Check Results """
        self.total_p = T.switch(self.y_hat, 1, 0)
        self.total_r = 1 - y_0
        self.correct = y_all[self.y_hat]
        self.correct_t = T.switch(self.correct, T.switch(y_0, 0, 1), 0)
        self.correct_f = T.switch(self.correct, T.switch(y_0, 1, 0), 0)
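The output layer prepends a non-anaphoric slot to the candidate scores, and the labels are extended the same way: y_0 is 1 only when the mention has no true antecedent. A tiny NumPy illustration of that label layout follows (the helper name make_y_all is made up for this sketch).

import numpy as np

def make_y_all(y):
    # Prepend the non-anaphoric slot, mirroring y_0 / y_all above.
    y_0 = 0 if y.sum() else 1              # 1 iff no true antecedent exists
    return np.concatenate([[y_0], y])

print(make_y_all(np.array([0, 1, 0])))     # [0 0 1 0]: the true antecedent keeps its label
print(make_y_all(np.array([0, 0, 0])))     # [1 0 0 0]: only the non-anaphoric slot is labelled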