import numpy as np

# `assert_is_binary` and `assert_is_stochastic` are input-validation helpers
# assumed to be defined elsewhere in the module.


def loss(y, y_pred):
    """
    Cross-entropy (log) loss. Returns the sum (not average!) of the losses
    per-sample.

    Parameters
    ----------
    y : numpy array of shape (n, m)
        Class labels (one-hot with m possible classes) for each of n examples
    y_pred : numpy array of shape (n, m)
        Probabilities of each of m classes for the n examples in the batch

    Returns
    -------
    loss : float
        The sum of the cross-entropy across classes and examples
    """
    assert_is_binary(y)
    assert_is_stochastic(y_pred)

    # prevent taking the log of 0
    eps = np.finfo(float).eps

    # each example is associated with a single class; sum the negative log
    # probability of the correct label over all samples in the batch.
    # observe that we are taking advantage of the fact that y is one-hot
    # encoded!
    cross_entropy = -np.sum(y * np.log(y_pred + eps))
    return cross_entropy
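# A minimal usage sketch, not part of the original code: `_demo_loss` is an
# illustrative helper that calls `loss` on a toy batch of two examples with
# three classes and compares it to the hand-computed sum of
# -log(probability assigned to the correct class).
def _demo_loss():
    y = np.array([[1.0, 0.0, 0.0],
                  [0.0, 1.0, 0.0]])        # one-hot labels
    y_pred = np.array([[0.7, 0.2, 0.1],
                       [0.1, 0.8, 0.1]])   # each row sums to 1
    expected = -(np.log(0.7) + np.log(0.8))  # ~0.5798
    return loss(y, y_pred), expected         # equal up to the `eps` term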
def grad(y, y_pred):
    """
    Gradient of the cross-entropy loss with respect to the softmax input.

    Let:  f(z) = cross_entropy(softmax(z)).
    Then: df / dz = softmax(z) - y_true
                  = y_pred - y_true

    Note that this gradient goes through both the cross-entropy loss AND the
    softmax non-linearity to return df / dz (rather than df / d softmax(z)).

    Parameters
    ----------
    y : numpy array of shape (n, m)
        A one-hot encoding of the true class labels. Each row constitutes a
        training example, and each column is a different class.
    y_pred : numpy array of shape (n, m)
        The network predictions for the probability of each of m class labels
        on each of n examples in a batch.

    Returns
    -------
    grad : numpy array of shape (n, m)
        The gradient of the cross-entropy loss with respect to the *input*
        to the softmax function.
    """
    assert_is_binary(y)
    assert_is_stochastic(y_pred)

    # the derivative of the cross-entropy wrt z is y_pred - y_true, so we can
    # simply subtract the one-hot labels from the predicted probabilities
    grad = y_pred - y

    # [optional] scale the gradients by the number of examples in the batch
    # n, m = y.shape
    # grad /= n
    return grad
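# A minimal finite-difference sketch, not part of the original code: `softmax`
# and `check_grad` are illustrative helpers. The check verifies numerically
# that `grad` really is the derivative of loss(softmax(z)) with respect to the
# pre-softmax logits z, i.e. that df / dz = y_pred - y.
def softmax(z):
    # subtract the row-wise max for numerical stability
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)


def check_grad(seed=0, n=4, m=3, h=1e-6):
    rng = np.random.default_rng(seed)
    z = rng.normal(size=(n, m))                 # random logits
    y = np.eye(m)[rng.integers(0, m, size=n)]   # random one-hot labels

    analytic = grad(y, softmax(z))

    # central finite differences of loss(softmax(z)) wrt each entry of z
    numeric = np.zeros_like(z)
    for i in range(n):
        for j in range(m):
            zp, zm = z.copy(), z.copy()
            zp[i, j] += h
            zm[i, j] -= h
            numeric[i, j] = (loss(y, softmax(zp)) - loss(y, softmax(zm))) / (2 * h)

    # the two should agree closely (max abs difference typically < 1e-7)
    return np.max(np.abs(analytic - numeric))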