Example #1
    def _backward_propagation(self, seq, y_):
        """
        Parameters
        ----------
        seq : list
            Variable-length sequence of elements in the vocabulary. This
            is needed both for its length and for its input representations.

        y_ : list
            The label vector.

        Returns
        -------
        tuple
            The matrices of derivatives (d_W_hy, d_b, d_W_hh, d_W_xh).
        
        """
        # Output errors:
        y_err = self.y
        y_err[np.argmax(y_)] -= 1
        h_err = y_err.dot(self.W_hy.T) * d_tanh(self.h[-1])
        d_W_hy = np.outer(self.h[-1], y_err)
        d_b = y_err
        # For accumulating the gradients through time:
        d_W_hh = np.zeros(self.W_hh.shape)
        d_W_xh = np.zeros(self.W_xh.shape)
        # Back-prop through time; the +1 is because the 0th
        # hidden state is the all-0s initial state.
        num_steps = len(seq) + 1
        for t in reversed(range(1, num_steps)):
            d_W_hh += np.outer(self.h[t], h_err)
            word_rep = self.get_word_rep(seq[t - 1])
            d_W_xh += np.outer(word_rep, h_err)
            h_err = h_err.dot(self.W_hh.T) * d_tanh(self.h[t])
        return (d_W_hy, d_b, d_W_hh, d_W_xh)
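None of the examples shows `d_tanh`; it is assumed to come from a shared utilities module. In Examples #1, #2, #5, and #6 it is applied to values that are already tanh outputs (hidden states or node vectors), which suggests a definition in terms of the activation itself; Example #3 instead applies it to the pre-activation `Z1`, which would require `1.0 - np.tanh(z) ** 2`. A minimal sketch of the first convention (the actual definition in these sources is not shown):

import numpy as np

def d_tanh(z):
    # Derivative of tanh written in terms of the tanh output itself:
    # if a = tanh(x), then d/dx tanh(x) = 1 - a**2.
    return 1.0 - z ** 2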
Example #2
    def _backward_propagation(self, seq, y_):
        """
        Parameters
        ----------
        seq : list
            Variable-length sequence of elements in the vocabulary. This
            is needed both for its length and for its input representations.

        y_ : list
            The label vector.

        Returns
        -------
        tuple
            The matrices of derivatives (d_W_hy, d_b, d_W_hh, d_W_xh).
        
        """            
        # Output errors:
        y_err = self.y
        y_err[np.argmax(y_)] -= 1
        h_err = y_err.dot(self.W_hy.T) * d_tanh(self.h[-1])
        d_W_hy = np.outer(self.h[-1], y_err)
        d_b = y_err
        # For accumulating the gradients through time:
        d_W_hh = np.zeros(self.W_hh.shape)
        d_W_xh = np.zeros(self.W_xh.shape)
        # Back-prop through time; the +1 is because the 0th
        # hidden state is the all-0s initial state.
        num_steps = len(seq)+1
        for t in reversed(range(1, num_steps)):
            d_W_hh += np.outer(self.h[t], h_err)
            word_rep = self.get_word_rep(seq[t-1])
            d_W_xh += np.outer(word_rep, h_err)
            h_err = h_err.dot(self.W_hh.T) * d_tanh(self.h[t])
        return (d_W_hy, d_b, d_W_hh, d_W_xh)
Example #3
    def _backward_propagation(self, X, Y):
        m = X.shape[1]  # number of examples (one example per column of X)

        W1 = self.params['W1']
        W2 = self.params['W2']
        Z1 = self.caches['Z1']
        A1 = self.caches['A1']
        Z2 = self.caches['Z2']
        A2 = self.caches['A2']

        # dZ2 = dA2 * dA2/dZ2
        # A2-Y = ( -Y/A2 + (1-Y)/(1-A2) ) * ( A2*(1-A2) )
        shortcut = True
        if shortcut:
            dZ2 = A2 - Y
        else:
            dA2 = -np.divide(Y, A2) + np.divide(
                1 - Y, 1 - A2)  # -(Y/A2) + (1-Y) / (1-A2)
            dZ2 = np.multiply(dA2, d_sigmoid(Z2))  # dA2 * g'(Z2)
        dW2 = np.dot(dZ2, A1.T) / float(m)
        db2 = np.sum(dZ2, axis=1, keepdims=True) / float(m)
        #dZ1 = np.dot(W2.T, dZ2) * (1-np.power(A1, 2))
        dZ1 = np.dot(W2.T, dZ2) * d_tanh(Z1)  # dZ1 = dZA1 * dA1/dZ1 (=g'(Z1))
        dW1 = np.dot(dZ1, X.T) / float(m)
        db1 = np.sum(dZ1, axis=1, keepdims=True) / float(m)

        self.grads['dW2'] = dW2
        self.grads['db2'] = db2
        self.grads['dW1'] = dW1
        self.grads['db1'] = db1
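Example #3 reads W1 and W2 from `self.params` and Z1, A1, Z2, A2 from `self.caches`, so it presupposes a forward pass with a tanh hidden layer, a sigmoid output, and one example per column of X. That forward pass is not shown; the following is a minimal standalone sketch consistent with the gradients above (the function name, `sigmoid`, and the `b1`/`b2` bias entries are assumptions):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def forward_propagation(params, X):
    # Hypothetical forward pass matching Example #3's backward pass:
    # tanh hidden layer, sigmoid output, one example per column of X.
    Z1 = np.dot(params['W1'], X) + params['b1']
    A1 = np.tanh(Z1)
    Z2 = np.dot(params['W2'], A1) + params['b2']
    A2 = sigmoid(Z2)
    caches = {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}
    return A2, caches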
Example #4
    def backward_propagation(self, vectree, predictions, ex, labels):
        root = self._get_vector_tree_root(vectree)
        # Output errors:
        y_err = predictions
        y_err[np.argmax(labels)] -= 1
        d_W_hy = np.outer(root, y_err)
        d_b_y = y_err
        # Internal error accumulation:
        d_W = np.zeros_like(self.W)
        d_b = np.zeros_like(self.b)
        h_err = y_err.dot(self.W_hy.T) * d_tanh(root)
        d_W, d_b = self._tree_backprop(vectree, h_err, d_W, d_b)
        return d_W_hy, d_b_y, d_W, d_b
Example #5
    def backward_propagation(self, h, predictions, seq, labels):
        """
        Parameters
        ----------
        h : np.array, shape (m, self.hidden_dim)
            Matrix of hidden states. `m` is the length of the current
            example (which is allowed to vary).

        predictions : np.array, dimension `len(self.classes)`
            Vector of predictions.

        seq : list of lists
            The original example.

        labels : np.array, dimension `len(self.classes)`
            One-hot vector giving the true label.

        Returns
        -------
        tuple
            The matrices of derivatives (d_W_hy, d_b, d_W_hh, d_W_xh).

        """
        # Output errors:
        y_err = predictions
        y_err[np.argmax(labels)] -= 1
        h_err = y_err.dot(self.W_hy.T) * d_tanh(h[-1])
        d_W_hy = np.outer(h[-1], y_err)
        d_b = y_err
        # For accumulating the gradients through time:
        d_W_hh = np.zeros(self.W_hh.shape)
        d_W_xh = np.zeros(self.W_xh.shape)
        # Back-prop through time; the +1 is because the 0th
        # hidden state is the all-0s initial state.
        num_steps = len(seq)+1
        for t in reversed(range(1, num_steps)):
            d_W_hh += np.outer(h[t], h_err)
            word_rep = self.get_word_rep(seq[t-1])
            d_W_xh += np.outer(word_rep, h_err)
            h_err = h_err.dot(self.W_hh.T) * d_tanh(h[t])
        return (d_W_hy, d_b, d_W_hh, d_W_xh)
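Example #5 (like #1 and #2) returns the per-example gradients rather than applying them. A typical caller would fold them into a plain SGD step; a minimal usage sketch under that assumption (`sgd_step`, `rnn`, and `eta` are hypothetical names, and the parameter attributes are assumed to mirror the gradient names above):

def sgd_step(rnn, h, predictions, seq, labels, eta=0.01):
    # Hypothetical usage: compute per-example gradients and apply a plain
    # SGD update. `rnn` is assumed to expose W_hy, b, W_hh, and W_xh
    # matching the gradients returned by backward_propagation.
    d_W_hy, d_b, d_W_hh, d_W_xh = rnn.backward_propagation(h, predictions, seq, labels)
    rnn.W_hy -= eta * d_W_hy
    rnn.b -= eta * d_b
    rnn.W_hh -= eta * d_W_hh
    rnn.W_xh -= eta * d_W_xh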
Example #6
    def _tree_backprop(self, deep_tree, h_err, d_W, d_b):
        # This is the leaf-node condition for vector trees:
        if isinstance(deep_tree, np.ndarray):
            return d_W, d_b
        else:
            # Bias gradient:
            d_b += h_err
            # Get the left and right representations:
            left_subtree, right_subtree = deep_tree[0], deep_tree[1]
            left_rep = self._get_vector_tree_root(left_subtree)
            right_rep = self._get_vector_tree_root(right_subtree)
            # Combine them and update d_W:
            combined = np.concatenate((left_rep, right_rep))
            d_W += np.outer(combined, h_err)
            # Get the gradients for both child nodes:
            h_err = h_err.dot(self.W.T) * d_tanh(combined)
            # Split the gradients between the children and continue
            # backpropagation down each subtree:
            l_err = h_err[:self.embed_dim]
            r_err = h_err[self.embed_dim:]
            d_W, d_b = self._tree_backprop(left_subtree, l_err, d_W, d_b)
            d_W, d_b = self._tree_backprop(right_subtree, r_err, d_W, d_b)
        return d_W, d_b
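Example #6 recurses over a "vector tree" whose leaves are `np.ndarray` word vectors and whose internal nodes expose their children at indices 0 and 1, with the node's combined representation fetched via `self._get_vector_tree_root` (also used in Example #4). The layout of that structure is not shown; one sketch consistent with the indexing above, assuming each internal node is stored as `[left, right, combined_vector]` (both the layout and the helper body are assumptions):

import numpy as np

def get_vector_tree_root(vectree):
    # Hypothetical helper: a leaf node is its own representation; for an
    # internal node stored as [left, right, combined_vector], the combined
    # vector sits in the final slot.
    if isinstance(vectree, np.ndarray):
        return vectree
    return vectree[-1]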
Example #7
def test_d_tanh(arg, expected):
    assert np.array_equal(utils.d_tanh(arg), expected)
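Example #7 shows only the test body; the driving `pytest.mark.parametrize` decorator is not shown. A minimal sketch of how cases might be supplied, assuming the `1 - z**2` convention for `utils.d_tanh` (the argument/expected pairs here are illustrative, not taken from the source):

import numpy as np
import pytest
import utils

@pytest.mark.parametrize("arg, expected", [
    # Under d_tanh(z) = 1 - z**2: d_tanh(0) = 1, d_tanh(+/-1) = 0.
    (np.array([0.0]), np.array([1.0])),
    (np.array([1.0, -1.0]), np.array([0.0, 0.0])),
])
def test_d_tanh(arg, expected):
    assert np.array_equal(utils.d_tanh(arg), expected)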