Example #1
    def backward(self, grad_out):
        # Diagonal of the softmax Jacobian:
        # d(softmax_i) / dz_i
        # = d(exp(z_i) * sum(exp(z))^-1) / dz_i
        # = d(exp(z_i))/dz_i * sum(exp(z))^-1 + exp(z_i) * d(sum(exp(z))^-1)/dz_i
        # = exp(z_i) * sum(exp(z))^-1 + exp(z_i) * -1 * sum(exp(z))^-2 * exp(z_i)
        # = exp(z_i) / sum(exp(z)) - exp(z_i)^2 / sum(exp(z))^2

        # Off-diagonal (i != j):
        # d(softmax_i) / dz_j
        # = d(exp(z_i) * sum(exp(z))^-1) / dz_j
        # = exp(z_i) * d(sum(exp(z))^-1) / dz_j
        # = exp(z_i) * -1 * sum(exp(z))^-2 * exp(z_j)
        # = -exp(z_i) * exp(z_j) / sum(exp(z))^2

        # Recompute the softmax terms, subtracting the per-row max (not the min)
        # so exp() cannot overflow for large logits.
        z_max = xp.max(self.z, axis=1)
        z = self.z - z_max[:, xp.newaxis]
        exp = xp.exp(z)
        sum_exp = xp.sum(exp, axis=1)
        # Off-diagonal part: -exp(z_i) * exp(z_j) / sum(exp(z))^2, per sample.
        outer_mat = -xp.einsum("ij,ik->ijk", exp, exp)
        outer_mat /= sum_exp[:, xp.newaxis, xp.newaxis]**2
        # Add exp(z_i) / sum(exp(z)) on the diagonal to complete the Jacobian.
        diag = exp / sum_exp[:, xp.newaxis]
        diag_idx = xp.arange(diag.shape[1])
        outer_mat[:, diag_idx, diag_idx] += diag
        # Chain rule: contract grad_out with each sample's Jacobian.
        grad_in = xp.einsum("ik,ikj->ij", grad_out, outer_mat)
        # Alternative: use matmul() directly instead of einsum.
        # grad_in = xp.squeeze(xp.matmul(grad_out[:, xp.newaxis, :], outer_mat))
        return grad_in
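The comments above reduce to the standard softmax Jacobian J[i, j] = p_i * (delta_ij - p_j), where p = softmax(z). Below is a minimal numpy sketch (assuming xp is plain numpy here) that builds that Jacobian for one sample and checks one column against a finite difference; the array values are made up for illustration.

import numpy as np

z = np.array([[0.5, -1.0, 2.0]])                       # one sample, three logits (made-up values)
p = np.exp(z - z.max(axis=1, keepdims=True))
p /= p.sum(axis=1, keepdims=True)                      # softmax probabilities

# Analytic Jacobian: J[i, j] = p_i * (delta_ij - p_j)
J = np.diag(p[0]) - np.outer(p[0], p[0])

# Finite-difference check of the column d(softmax)/dz_0
eps = 1e-6
z_pert = z.copy()
z_pert[0, 0] += eps
p_pert = np.exp(z_pert - z_pert.max(axis=1, keepdims=True))
p_pert /= p_pert.sum(axis=1, keepdims=True)
print(np.allclose((p_pert - p)[0] / eps, J[:, 0], atol=1e-4))  # True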
Example #2
    def forward(self, z):
        self.z = z
        # Subtract the per-row max (not the min) so exp() cannot overflow.
        z_max = xp.max(z, axis=1)
        z = z - z_max[:, xp.newaxis]
        exp = xp.exp(z)
        sum_exp = xp.sum(exp, axis=1)[:, xp.newaxis]
        outputs = exp / sum_exp
        return outputs
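The reason to subtract the row max before exponentiating: with large logits, exp() overflows to inf and the division produces NaNs, while exp(z - max(z)) always lies in (0, 1]. A small numpy sketch (again assuming xp is numpy; the values are illustrative):

import numpy as np

z = np.array([[1000.0, 1000.5, 999.0]])        # logits large enough to overflow exp()
z_max = np.max(z, axis=1)
p = np.exp(z - z_max[:, np.newaxis])           # exponents are now <= 0, so exp() stays finite
p /= np.sum(p, axis=1)[:, np.newaxis]
print(p.sum(axis=1))                           # [1.]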
Example #3
    def backward(self, grad_out):
        grad_out = grad_out.reshape(-1, self.num_rows, self.num_filters)
        # input_rows shape: (num_samples, num_rows, num_filter_inputs)
        # grad_out shape: (num_samples, num_rows, num_filters)
        self.grad_W = xp.einsum("ijk,ijl->kl", self.input_rows, grad_out)
        self.grad_b = xp.sum(grad_out, axis=(0, 1))
        # grad_out shape: (num_samples, num_rows, num_filters)
        # W shape: (num_filter_inputs, num_filters)
        # grad_rows shape: (num_samples, num_rows, num_filter_inputs)
        grad_rows = xp.dot(grad_out, xp.transpose(self.W))
        # Equivalent: xp.einsum("ijl,kl->ijk", grad_out, self.W)
        grad_in = common.row2im(grad_rows, self.row_indices, self.input_dim,
                                self.filter_size, self.stride, self.pad, xp)
        assert grad_in.shape[1:] == self.input_dim
        return grad_in
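The commented-out einsum and the xp.dot() call compute the same contraction: dot() with a 3-D left operand sums over its last axis and the first axis of the transposed weight matrix, broadcasting over the sample and row axes. A quick numpy check with made-up shapes (assuming xp is numpy):

import numpy as np

num_samples, num_rows, num_filters, num_filter_inputs = 2, 5, 4, 3
grad_out = np.random.randn(num_samples, num_rows, num_filters)
W = np.random.randn(num_filter_inputs, num_filters)

via_dot = np.dot(grad_out, np.transpose(W))            # shape (2, 5, 3)
via_einsum = np.einsum("ijl,kl->ijk", grad_out, W)
print(np.allclose(via_dot, via_einsum))                # True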
Example #4
    def backward(self, grad_out):
        num_samples = grad_out.shape[0]
        grad_out = grad_out.reshape((num_samples, -1))
        # Per-sample weight gradient is the outer product of the last axes
        # of x and grad_out, summed over samples:
        # grad_W_i = xp.einsum("ij,ik->ijk", self.x, grad_out)
        # grad_b_i = grad_out
        # self.grad_W = xp.sum(grad_W_i, axis=0)
        # self.grad_b = xp.sum(grad_b_i, axis=0)

        # x shape: (num_samples, num_inputs)
        # z shape: (num_samples, num_outputs)
        self.grad_W = xp.einsum("ij,ik->jk", self.x, grad_out)
        self.grad_b = xp.sum(grad_out, axis=0)
        # Equivalent: xp.einsum("ik,jk->ij", grad_out, self.W)
        grad_in = xp.dot(grad_out, xp.transpose(self.W))
        return grad_in.reshape((-1, *self.input_dim))
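Both einsum calls here have direct matmul equivalents, which can be easier to read: "ij,ik->jk" is x.T @ grad_out, and "ik,jk->ij" is grad_out @ W.T. A short numpy sketch with made-up shapes (assuming xp is numpy):

import numpy as np

num_samples, num_inputs, num_outputs = 4, 3, 2
x = np.random.randn(num_samples, num_inputs)
grad_out = np.random.randn(num_samples, num_outputs)
W = np.random.randn(num_inputs, num_outputs)

print(np.allclose(np.einsum("ij,ik->jk", x, grad_out), x.T @ grad_out))    # True
print(np.allclose(np.einsum("ik,jk->ij", grad_out, W), grad_out @ W.T))    # True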
Example #5
    def forward(self, yhat, y):
        self.yhat, self.y = yhat, y
        # The small epsilon keeps log() away from an exact zero.
        loss = -xp.sum(y * xp.log(yhat + 1e-6))
        return loss
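The matching backward pass is not shown on this page. Differentiating the loss above with respect to yhat gives -y / (yhat + 1e-6); a hypothetical sketch of what it could look like in the same style (the method name and the reuse of the 1e-6 epsilon are assumptions, not code from the repo):

    def backward(self):
        # d(-sum(y * log(yhat + eps))) / d(yhat) = -y / (yhat + eps)
        return -self.y / (self.yhat + 1e-6)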