Example No. 1
def test_010_softmax():
    """Test Case for sigmoid
    """
    u = ACTIVATION_DIFF_ACCEPTANCE_VALUE
    P = softmax(np.array([2.44756739, 2.13945115]).astype(TYPE_FLOAT))
    E = np.array([0.57642539, 0.42357461]).astype(TYPE_FLOAT)
    assert np.all(np.abs(P - E) < u)

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        X = MAX_ACTIVATION_VALUE * np.random.randn(N, M).astype(TYPE_FLOAT)
        assert np.all(np.isfinite(softmax(X)))
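These examples call a project-level softmax helper that is not shown on this page. As a point of reference, here is a minimal sketch of a numerically stable softmax that behaves the way the test above expects; the name and the row-wise handling are assumptions for this sketch, not the project's actual implementation.

import numpy as np

def softmax(x: np.ndarray) -> np.ndarray:
    """Numerically stable softmax over the last axis (sketch)."""
    x = np.atleast_2d(x)
    x = x - x.max(axis=-1, keepdims=True)   # subtract the row max to avoid overflow in exp
    e = np.exp(x)
    return np.squeeze(e / e.sum(axis=-1, keepdims=True))

# Matches the expected values in the test above within float tolerance:
print(softmax(np.array([2.44756739, 2.13945115])))  # ~[0.57642539 0.42357461]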
Example No. 2
    def predict(self, x):
        w1, w2 = self.params['w1'], self.params['w2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, w1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, w2) + b2
        y = softmax(a2)
        return y
Example No. 3
def predict(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']
    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)
    return y
Example No. 4
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)

        # convert a one-hot vector label into an index label
        if self.t.size == self.y.size:
            self.t = self.t.argmax(axis=1)

        loss = cross_entropy_error(self.y, self.t)

        return loss
Example No. 5
    def grad(self, x, t):
        w1, w2 = self.params['w1'], self.params['w2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}

        # forward
        a1 = np.dot(x, w1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, w2) + b2
        y = softmax(a2)

        # backward
        dy = (y - t) / x.shape[0]  # gradient of softmax + cross-entropy loss: dL/da2 = (y - t) / N
        grads['w2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis=0)

        da1 = np.dot(dy, w2.T)
        dz1 = (1.0 - sigmoid(a1)) * sigmoid(a1) * da1  # sigmoid's gradient
        grads['w1'] = np.dot(x.T, dz1)
        grads['b1'] = np.sum(dz1, axis=0)

        return grads
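The line "dy = (y - t) / x.shape[0]" above relies on the standard identity that, for softmax followed by the mean cross-entropy loss over a batch of N samples with one-hot labels, dL/da2 = (y - t) / N. Below is a small self-contained check of that identity against central differences; it uses plain NumPy and its own local helpers rather than the project's softmax/cross-entropy functions.

import numpy as np

def softmax(a):
    a = a - a.max(axis=-1, keepdims=True)
    e = np.exp(a)
    return e / e.sum(axis=-1, keepdims=True)

def loss(a, t):
    # Mean cross-entropy over the batch for one-hot labels t
    return -np.sum(t * np.log(softmax(a))) / a.shape[0]

rng = np.random.default_rng(0)
a = rng.normal(size=(4, 3))                  # pre-softmax scores (a2 above)
t = np.eye(3)[rng.integers(0, 3, size=4)]    # one-hot labels

analytic = (softmax(a) - t) / a.shape[0]     # dL/da2 = (y - t) / N

h = 1e-6
numeric = np.zeros_like(a)
for idx in np.ndindex(*a.shape):
    d = np.zeros_like(a)
    d[idx] = h
    numeric[idx] = (loss(a + d, t) - loss(a - d, t)) / (2 * h)

assert np.allclose(analytic, numeric, atol=1e-8)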
Example No. 6
    def predict(self, X):
        """
        Responsibility:
            Generate a concrete prediction 0/1 for binary classification
            and an index for categorical classification

            For binary classification where M=1, 1/True if Xi > 0 else 0.

            For categorical classification, argmax(X, axis=1) that identifies
            the class that gives the max probability.
        Args:
            X: scores
        Returns:
            Predictions
        """
        assert isinstance(X, np.ndarray) and X.dtype == TYPE_FLOAT, \
            f"Only np array of type {TYPE_FLOAT} is accepted"
        if X.ndim <= 1:
            X = np.array(X).reshape(1, -1)

        if self._log_loss_function == sigmoid_cross_entropy_log_loss:
            return (X > 0).astype(TYPE_LABEL)
        else:
            return np.argmax(softmax(X), axis=1).astype(TYPE_LABEL)
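The decision rule above is worth spelling out: for the sigmoid/binary case, sigmoid(x) > 0.5 exactly when x > 0, so the raw score can be thresholded at 0; for the categorical case, argmax(softmax(X), axis=1) equals argmax(X, axis=1) because softmax is strictly increasing within each row. A standalone sketch of the same rule follows; the helper name and the boolean switch are hypothetical and not part of the class above.

import numpy as np

def predict_from_scores(X: np.ndarray, binary: bool) -> np.ndarray:
    """Decision-rule sketch for raw scores X of shape (N, M)."""
    X = np.atleast_2d(X)
    if binary:
        # sigmoid(x) > 0.5  <=>  x > 0
        return (X > 0).astype(int)
    # argmax is unchanged by the (strictly increasing) softmax
    return np.argmax(X, axis=1)

print(predict_from_scores(np.array([-1.2, 0.3]), binary=True))           # [[0 1]]
print(predict_from_scores(np.array([[0.1, 2.0, -0.5]]), binary=False))   # [1]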
Example No. 7
def test_020_cross_entropy_log_loss_1d(caplog):
    """
    Objective:
        Test the categorical log loss values for P in 1 dimension.

    Constraints:
        1. The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h.
        2. The numerical gradient gn is within +/- u of the analytical g = -T/P.

    P: Probabilities from softmax of shape (M,)
    M: Number of nodes in the cross_entropy_log_loss layer.
    T: Labels

    Note:
        log(P=1) -> 0
        dlog(x)/dx = 1/x
    """
    def f(P: np.ndarray, T: np.ndarray):
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = 1, OHE label T[index] = 1 where
    # P=[0,0,0,...,1,...,0], T=[0,0,0,...,1,...,0].
    #
    # Do not forget the Jacobian shape is (M,) and calculate each element.
    # 1. For T[i]=1, loss L = -log(P[i]) = 0 and dL/dP[i] = -(1/P[i]) = -1 is expected.
    # 2. For T[i]=0, the loss does not depend on P[i], hence a gradient of 0 is expected.
    # --------------------------------------------------------------------------------
    M: TYPE_INT = np.random.randint(2, NUM_MAX_NODES)
    index: TYPE_INT = TYPE_INT(np.random.randint(
        0, M))  # Position of the true label in P
    P1 = np.zeros(M, dtype=TYPE_FLOAT)
    P1[index] = TYPE_FLOAT(1.0)
    T1 = np.zeros(M, dtype=TYPE_LABEL)
    T1[index] = TYPE_LABEL(1)

    # Analytically correct gradient for P=1, T=1
    AG = np.zeros_like(P1, dtype=TYPE_FLOAT)
    AG[index] = TYPE_FLOAT(-1)  # dL/dP = -1

    EGN1 = np.zeros_like(P1, dtype=TYPE_FLOAT)  # Expected numerical gradient
    EGN1[index] = (-1 * logarithm(TYPE_FLOAT(1.0 + h)) + TYPE_FLOAT(1) *
                   logarithm(TYPE_FLOAT(1.0 - h))) / TYPE_FLOAT(2 * h)
    assert np.all(np.abs(EGN1-AG) < u), \
        "Expected EGN-1<%s but %s\nEGN=\n%s" % (u, (EGN1-AG), EGN1)

    GN1 = numerical_jacobian(partial(f, T=T1), P1)
    assert np.all(np.abs(GN1-AG) < u), \
        "Expected GN-1<%s but %s\nGN=\n%s" % (u, (GN1-AG), GN1)

    # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
    assert GN1.shape == EGN1.shape
    assert np.all(np.abs(EGN1-GN1) < u), \
        "Expected GN1==EGN1 but GN1-EGN1=\n%sP=\n%s\nT=%s\nEGN=\n%s\nGN=\n%s\n" \
        % (np.abs(GN1-EGN1), P1, T1, EGN1, GN1)

    # The numerical gradient gn is within +/- u of the analytical g = -T/P
    G1 = np.zeros_like(P1, dtype=TYPE_FLOAT)
    G1[T1 == 1] = -1 * (T1[index] / P1[index])
    # G1[T1 != 0] = 0
    check.equal(np.all(np.abs(G1 - GN1) < u), True,
                "G1-GN1 %s\n" % np.abs(G1 - GN1))

    # --------------------------------------------------------------------------------
    # For (P, T): P[T2] = softmax(x) with x drawn from np.random.uniform(); T2 is the index label
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        M = np.random.randint(2, NUM_MAX_NODES)  # M > 1
        T2 = TYPE_LABEL(np.random.randint(0, M))  # location of the truth
        P2 = np.zeros(M, dtype=TYPE_FLOAT)
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID,
                                  high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P2[T2] = p

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # G:[0, 0, ..., g, 0, ...] where g is the numerical gradient close to -1/p.
        # --------------------------------------------------------------------------------
        E2 = np.zeros_like(P2, dtype=TYPE_FLOAT)   # Expected numerical gradient
        E2[T2] = TYPE_FLOAT(-1) * (logarithm(p + h) -
                                   logarithm(p - h)) / TYPE_FLOAT(2 * h)
        N2 = numerical_jacobian(partial(f, T=T2), P2)

        # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
        assert N2.shape == E2.shape
        assert np.all(np.abs(N2-E2) < u), \
            f"Delta expected to be < {u} but \n{np.abs(N2-E2)}"

        G2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        G2[T2] = -1 / p

        # The numerical gradient gn is within +/- u of the analytical g = -T/P
        check.equal(np.all(np.abs(G2 - N2) < u), True,
                    "G2-N2 %s\n" % np.abs(G2 - N2))
Example No. 8
def test_020_cross_entropy_log_loss_2d(caplog):
    """
    Objective:
        Test case for cross_entropy_log_loss(X, T) for X:shape(N,M), T:shape(N,)
    Expected:
    """
    def f(P: np.ndarray, T: np.ndarray):
        """Loss function"""
        # For P.ndim==2 of shape (N, M), cross_entropy_log_loss() returns (N,).
        # Each of which has the loss for P[n].
        # If divided by P.shape[0] or N, the loss gets 1/N, which is wrong.
        # This is not a gradient function but a loss function.
        # return np.sum(cross_entropy_log_loss(P, T)) / P.shape[0]

        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    for _ in range(NUM_MAX_TEST_TIMES):
        # --------------------------------------------------------------------------------
        # [2D test case]
        # P:(N, M) is a probability matrix where P[n][T[n]] = p for 0 <= n < N, 0 <= T[n] < M.
        # T:(N,)   is an index label where T[n] = m is the label as an integer, e.g. m=3 for the 3rd class.
        # L = -log(p) -> dL/dp = -1 / p
        #
        # Keep the p value away from 0. As p gets close to 0, log(p +/- h) gets large, e.g.
        # -11.512925464970229, hence log(p +/- h) / 2h explodes.
        # --------------------------------------------------------------------------------
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID,
                                  high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        N = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M = np.random.randint(2, NUM_MAX_NODES)
        # label index, not OHE
        T = np.random.randint(0, M, N).astype(
            TYPE_LABEL)  # N rows of labels, max label value is M-1
        P = np.zeros((N, M)).astype(TYPE_FLOAT)
        P[range(N),  # Set p at random row position
          T] = p
        E = np.zeros_like(P).astype(TYPE_FLOAT)
        E[range(N),  # Set the expected numerical gradient at the label positions
          T] = (TYPE_FLOAT(-1) * logarithm(p + h) +
                TYPE_FLOAT(1) * logarithm(p - h)) / (TYPE_FLOAT(2) * h)

        G = numerical_jacobian(partial(f, T=T), P)
        assert E.shape == G.shape, \
            f"Jacobian shape is expected to be {E.shape} but {G.shape}."
        assert np.all(np.abs(E-G) < u), \
            f"Delta expected to be < {u} but \n{np.abs(E-G)}"

        A = np.zeros_like(P).astype(TYPE_FLOAT)
        A[range(N),  # Set the analytical gradient -1/p at the label positions
          T] = -1 / p

        check.equal(np.all(np.abs(A - G) < u), True,
                    "A-G %s\n" % np.abs(A - G))
Example No. 9
    for _ in range(NUM_MAX_TEST_TIMES):
        # --------------------------------------------------------------------------------
        # [1D test case]
        # P:[0, 0, ..., p, 0, ...] where P[index] = p = softmax(x)
        # T:[0, 0, ..., 1, 0, ...] is an OHE label where T[index] = 1
        # Loss L = -t * log(p) = -log(p)
        # dL/dP[index] = -1 / p
        # --------------------------------------------------------------------------------
        M = np.random.randint(2, NUM_MAX_NODES)  # M > 1
        index = np.random.randint(0, M)  # location of the truth
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID,
                                  high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P3 = np.zeros(M, dtype=TYPE_FLOAT)
        P3[index] = p
        T3 = np.zeros(M).astype(TYPE_LABEL)  # OHE index
        T3[index] = TYPE_LABEL(1)

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # --------------------------------------------------------------------------------
        E3 = np.zeros_like(P3, dtype=TYPE_FLOAT)   # Expected numerical gradient
        E3[index] = TYPE_FLOAT(-1 * logarithm(p + h) +
                               1 * logarithm(p - h)) / TYPE_FLOAT(2 * h)
        N3 = numerical_jacobian(partial(f, T=T3), P3)
        assert N3.shape == E3.shape
        assert np.all(np.abs(N3-E3) < u), \
            f"Delta expected to be < {u} but \n{np.abs(N3-E3)}"
Example No. 10
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss
Example No. 11
    def forward(self, x, **kwargs):
        self.y = kwargs['y']
        self.y_hat = softmax(x)
        self.loss = cross_entropy_error(self.y, self.y_hat)
        return self.loss
Example No. 12
def disabled_test_040_softmax_log_loss_2d(caplog):
    """
    TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(softmax_cross_entropy_log_loss) / N.
        2. gradient_numerical() == numerical_jacobian(objective, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_softmax_log_loss_2d_ohe"

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)  # number of node > 1
        _layer = layer.CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=softmax_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)  # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = int(1)

        # The log loss function requires X:(N, M) and T:(N,) in index label format.
        X, T = transform_X_T(X, T)
        _layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        # --------------------------------------------------------------------------------
        # Expected analytical gradient EG = (dL/dX) = (A-T)/N
        # --------------------------------------------------------------------------------
        A = softmax(X)
        EG = np.copy(A)
        EG[np.arange(N), T] -= TYPE_FLOAT(
            1)  # Shape(N,), subtract from elements for T=1 only
        EG /= TYPE_FLOAT(N)

        # --------------------------------------------------------------------------------
        # Total loss Z = np.sum(J)/N
        # Expected loss EL = -sum(T*log(A)) / N
        # (J, P) = softmax_cross_entropy_log_loss(X, T) where J:shape(N,) is the loss
        # for each input and P is the activation by softmax(X).
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        J, P = softmax_cross_entropy_log_loss(X, T)
        EL = np.array(-np.sum(logarithm(A[np.arange(N), T])) / N,
                      dtype=TYPE_FLOAT)

        # Constraint: A == P as they are softmax(X)
        assert np.all(np.abs(A-P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \
            f"Need A==P==softmax(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n"

        # Constraint: Log loss layer output L == sum(J)/N from the log loss function
        Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT)
        assert np.array_equal(L, Z), \
            f"Need log loss layer output L == sum(J)/N but L=\n{L}\nZ=\n{Z}."

        # Constraint: L/loss is close to expected loss EL.
        assert np.all(np.abs(EL-L) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            "Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n"

        # constraint: gradient_numerical() == numerical_jacobian(objective, X)
        # TODO: compare the diff to accommodate numerical errors.
        GN = _layer.gradient_numerical()  # [dL/dX] from the layer

        def objective(x):
            """Function to calculate the scalar loss L for cross entropy log loss"""
            j, p = softmax_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X)  # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================

        # constraint: Analytical gradient G: gradient() == EG == (P-1)/N.
        dY = TYPE_FLOAT(1)
        G = _layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"

        # constraint: Gradient g of the log loss layer needs -1 < g < 1
        # abs(P-T) = abs(softmax(X)-T) cannot be > 1.
        assert np.all(np.abs(G) < 1), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}"
        assert np.all(np.abs(GN[0]) < (1+GRADIENT_DIFF_ACCEPTANCE_RATIO)), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}"

    profiler.disable()
    profiler.print_stats(sort="cumtime")
Example No. 13
    def forward(self, x):
        self.out = softmax(x)
        return self.out
Example No. 14
def validate_relu_neuron_training(matmul: Matmul,
                                  activation: ReLU,
                                  loss: CrossEntropyLogLoss,
                                  X: np.ndarray,
                                  T: np.ndarray,
                                  num_epochs: int = 100,
                                  test_numerical_gradient: bool = False,
                                  callback: Callable = None):
    activation.objective = loss.function
    matmul.objective = compose(activation.function, loss.function)
    objective = compose(matmul.function, matmul.objective)

    num_no_progress: int = 0  # number of times the loss L has not decreased.
    history: List[np.ndarray] = []

    loss.T = T
    for i in range(num_epochs):
        L = objective(X)
        N = X.shape[0]
        P = softmax(relu(np.matmul(matmul.X, matmul.W.T)))
        EDA = expected_gradient_from_log_loss(P=P, T=T, N=N)

        # ********************************************************************************
        # Constraint: Expected gradients must match actual
        # ********************************************************************************
        validate_relu_neuron_round_trip(matmul=matmul,
                                        activation=activation,
                                        X=X,
                                        dA=EDA)

        # --------------------------------------------------------------------------------
        # gradient descent and get the analytical dL/dX, dL/dW
        # --------------------------------------------------------------------------------
        previous_W = copy.deepcopy(matmul.W)
        matmul.update()  # dL/dX, dL/dW

        # ********************************************************************************
        #  Constraint. W in the matmul has been updated by the gradient descent.
        # ********************************************************************************
        Logger.debug("W after is \n%s", matmul.W)
        if np.array_equal(previous_W, matmul.W):
            Logger.warning("W has not been updated")

        # ********************************************************************************
        # Constraint: Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # ********************************************************************************
        if i > 0 and L >= history[-1]:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s] for %s times.",
                i, L, history[-1], num_no_progress + 1)
            # --------------------------------------------------------------------------------
            # Reducing the learning rate can make the situation worse.
            # When the lr was reduced every time L >= history[-1], the non-improvements became
            # successive, eventually exceeding 50 in a row and ending in failure.
            # Keeping the learning rate made L >= history[-1] more frequent, but with at most a
            # few successive events, and the training still kept progressing.
            # --------------------------------------------------------------------------------
            num_no_progress += 1
            if num_no_progress > 5:
                matmul.lr = matmul.lr * 0.95

            if num_no_progress > 50:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress)
                break
        else:
            num_no_progress = 0

        history.append(L)

        if callback:
            callback(W=matmul.W)

    return history
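The first three lines of validate_relu_neuron_training wire the layers into a single objective via a compose helper that is not shown on this page. A minimal sketch of the left-to-right composition those lines assume is below; the actual helper in the source project may be implemented differently.

from functools import reduce

def compose(*functions):
    """Left-to-right composition sketch: compose(f, g)(x) == g(f(x))."""
    return reduce(lambda f, g: (lambda x: g(f(x))), functions)

# With this reading:
#   matmul.objective(X) == loss.function(activation.function(X))
#   objective(X)        == loss.function(activation.function(matmul.function(X)))
double = lambda x: 2 * x
inc = lambda x: x + 1
assert compose(double, inc)(3) == 7   # inc(double(3))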
Example No. 15
def test_030_objective_methods_1d_ohe():
    """
    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    Expected:
        Initialization detects access to non-initialized parameters and fails.
        
        For X.ndim > 0, the layer transforms X into 2D so as to use the numpy tuple-
        like indexing:
        P[
            (0,3),
            (2,4)
        ]
        Hence, the shape of GN, G are 2D.
    """
    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_030_objective_methods_1d_ohe"
    N = 1

    for _ in range(NUM_MAX_TEST_TIMES):
        M: int = np.random.randint(2, NUM_MAX_NODES)
        assert M >= 2, "Softmax is for multi-class classification. "\
                       "Use Sigmoid for binary classification."

        _layer = layer.CrossEntropyLogLoss(name=name,
                                           num_nodes=M,
                                           log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)  # OHE labels.
        T[np.random.randint(0, M)] = TYPE_LABEL(1)
        _layer.T = T

        P = softmax(X)
        EG = ((P - T) / N).reshape(1, -1).astype(
            TYPE_FLOAT)  # Expected analytical gradient dL/dX = (P-T)/N

        Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T,
                     P, EG)

        # --------------------------------------------------------------------------------
        # constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T)),
                     dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        assert np.array_equal(
            L, Z), f"SoftmaxLogLoss output should be {L} but {Z}."

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X)
        # Use a dummy _layer for the objective function because using the "_layer"
        # updates its X, Y, which can interfere with the independence of the _layer.
        # --------------------------------------------------------------------------------
        GN = _layer.gradient_numerical()  # [dL/dX] from the _layer

        # --------------------------------------------------------------------------------
        # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L
        # because it would apply transform_X_T multiple times.
        # Internally, transform_X_T(X, T) has already transformed T into an index label
        # in 1D with length 1 via "T = T.reshape(-1)".
        # Providing X in 1D to "dummy.function(x)" would re-run "transform_X_T(X, T)".
        # For an input with (X.ndim == T.ndim == 1), T must be an OHE label and
        # T.shape == P.shape must hold for OHE labels.
        # However, T has already been converted into the index format by transform_X_T
        # (applying transform_X_T multiple times), and (T.shape=(1,1), X.shape=(1, > 1))
        # violates the (X.shape == T.shape) constraint.
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name="dummy",
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # dummy.function(X)
        # --------------------------------------------------------------------------------
        # O = lambda x: dummy.objective(dummy.function(x))    # Objective function
        O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T),
                             dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        # --------------------------------------------------------------------------------
        EGN = numerical_jacobian(O, X).reshape(1,
                                               -1)  # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}."

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G: gradient() == (P-1)/N.
        # --------------------------------------------------------------------------------
        dY = TYPE_FLOAT(1)
        G = _layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG} but G-EG \n{np.abs(G-EG)}\n"

        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G-GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
Example No. 16
def disabled_test_030_objective_methods_2d_ohe():
    """
    TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    Expected:
        Initialization detects access to non-initialized parameters and fails.
    """
    def objective(X: np.ndarray) -> Union[float, np.ndarray]:
        """Dummy objective function to calculate the loss L"""
        assert X.ndim == 0, "The output of the log loss should be of shape ()"
        return X

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_030_objective_methods_2d_ohe"
    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        assert M >= 2, "Softmax is for multi-class classification. "\
                       "Use Sigmoid for binary classification."

        _layer = layer.CrossEntropyLogLoss(name=name,
                                           num_nodes=M,
                                           log_level=logging.DEBUG)
        _layer.objective = objective

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)  # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1)
        _layer.T = T

        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        P = softmax(X)
        EG = (P - T) / N  # Expected analytical gradient dL/dX = (P-T)/N

        # --------------------------------------------------------------------------------
        # constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T))) / N
        assert np.array_equal(
            L, Z), f"SoftmaxLogLoss output should be {L} but {Z}."

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X)
        # --------------------------------------------------------------------------------
        GN = _layer.gradient_numerical()  # [dL/dX] from the _layer

        # --------------------------------------------------------------------------------
        # Do not use CrossEntropyLogLoss.function() to simulate the objective function for
        # the expected GN. See the same part in test_030_objective_methods_1d_ohe().
        # --------------------------------------------------------------------------------
        # dummy= CrossEntropyLogLoss(
        #     name=name,
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # O = lambda x: dummy.objective(dummy.function(x))    # Objective function
        O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T)) / N
        # --------------------------------------------------------------------------------

        EGN = numerical_jacobian(O, X)  # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G: gradient() == (P-1)/N.
        # --------------------------------------------------------------------------------
        dY = float(1)
        G = _layer.gradient(dY)
        assert np.all(abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
Example No. 17
def train_binary_classifier(N: int,
                            D: int,
                            M: int,
                            X: np.ndarray,
                            T: np.ndarray,
                            W: np.ndarray,
                            log_loss_function: Callable,
                            optimizer: Optimizer,
                            num_epochs: int = 100,
                            test_numerical_gradient: bool = False,
                            log_level: int = logging.ERROR,
                            callback: Callable = None):
    """Test case for binary classification with matmul + log loss.
    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and 2 for softmax
        X: train data
        T: labels
        W: weight
        log_loss_function: cross entropy log loss function
        optimizer: Optimizer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag to test the analytical gradient against the numerical one.
        log_level: logging level
        callback: callback function to invoke at the end of each epoch.
    """
    name = __name__
    assert isinstance(T, np.ndarray) and np.issubdtype(
        T.dtype, np.integer) and T.ndim == 1 and T.shape[0] == N
    assert isinstance(
        X, np.ndarray) and X.dtype == TYPE_FLOAT and X.ndim == 2 and X.shape[
            0] == N and X.shape[1] == D
    assert isinstance(
        W, np.ndarray) and W.dtype == TYPE_FLOAT and W.ndim == 2 and W.shape[
            0] == M and W.shape[1] == D + 1
    assert num_epochs > 0 and N > 0 and D > 0

    assert ((log_loss_function == sigmoid_cross_entropy_log_loss and M == 1) or
            (log_loss_function == softmax_cross_entropy_log_loss and M >= 2))

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(name="loss",
                               num_nodes=M,
                               log_loss_function=log_loss_function,
                               log_level=log_level)

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul = Matmul(name="matmul",
                    num_nodes=M,
                    W=W,
                    optimizer=optimizer,
                    log_level=log_level)
    matmul.objective = loss.function

    num_no_progress: int = 0  # number of times the loss L has not decreased.
    loss.T = T
    history: List[np.ndarray] = [loss.function(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # Calculate the matmul output Y=f(X), and get the loss L = objective(Y)
        # Test the numerical gradient dL/dX=matmul.gradient_numerical().
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        L = loss.function(Y)

        if not (i % 50): print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # --------------------------------------------------------------------------------
        # Constraint: 1. Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # --------------------------------------------------------------------------------
        if L >= history[-1] and (i % 20) == 1:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s].",
                i, L, history[-1])
            if (num_no_progress := num_no_progress + 1) > 20:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress)
                # break
        else:
            num_no_progress = 0

        history.append(L)

        # --------------------------------------------------------------------------------
        # Expected dL/dW.T = X.T @ dL/dY = X.T @ (P-T) / N, and dL/dX = dL/dY @ W
        # where P = sigmoid(X @ W.T) or softmax(X @ W.T)
        # dL/dX = dL/dY @ W must use W BEFORE W is updated by the gradient descent.
        # --------------------------------------------------------------------------------
        P = None
        if log_loss_function == sigmoid_cross_entropy_log_loss:
            # P = sigmoid(np.matmul(X, W.T))
            P = sigmoid(np.matmul(matmul.X, matmul.W.T))
            P = P - T.reshape(-1, 1)  # T(N,) -> T(N,1) to align with P(N,1)
            assert P.shape == (
                N, 1), "P.shape is %s T.shape is %s" % (P.shape, T.shape)

        elif log_loss_function == softmax_cross_entropy_log_loss:
            # matmul.X.shape is (N, D+1), matmul.W.T.shape is (D+1, M)
            P = softmax(np.matmul(matmul.X, matmul.W.T))  # (N, M)
            P[np.arange(N), T] -= 1

        EDX = np.matmul(P / N, matmul.W)  # (N,M) @ (M, D+1) -> (N, D+1)
        EDX = EDX[::, 1:]  # Hide the bias    -> (N, D)
        EDW = np.matmul(matmul.X.T,
                        P / N).T  # ((D+1,N) @ (N, M)).T -> (M, D+1)

        # --------------------------------------------------------------------------------
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update Wn+1 = Wn - lr * dL/dX.
        # --------------------------------------------------------------------------------
        before = copy.deepcopy(matmul.W)
        dY = loss.gradient(TYPE_FLOAT(1))
        dX = matmul.gradient(dY)

        # Gradient descent; matmul.update() returns the analytical gradient dS=[dL/dW]
        # dL/dX.shape = (N, D) (dX from matmul.gradient(dY) above)
        # dL/dW.shape = (M, D+1)
        dS = matmul.update()
        dW = dS[0]
        # --------------------------------------------------------------------------------
        #  Constraint 1. W in the matmul has been updated by the gradient descent.
        # --------------------------------------------------------------------------------
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        if not validate_against_expected_gradient(EDX, dX):
            Logger.warning("Expected dL/dX \n%s\nDiff\n%s", EDX, EDX - dX)
        if not validate_against_expected_gradient(EDW, dW):
            Logger.warning("Expected dL/dW \n%s\nDiff\n%s", EDW, EDW - dW)

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradients gn=[dL/dX, dL/dW]
            # dL/dX.shape = (N, D)
            # dL/dW.shape = (M, D+1)
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W[0])
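For orientation, the shapes that the assertions at the top of train_binary_classifier demand can be sketched as below. TYPE_FLOAT is assumed to be np.float32 here, and the loss/matmul/optimizer objects come from the surrounding project, so the actual call is left commented out.

import numpy as np

N, D, M = 8, 2, 3                                           # batch size, features, nodes
X = np.random.randn(N, D).astype(np.float32)                # (N, D) training data
T = np.random.randint(0, M, N)                              # (N,)  integer index labels
W = (np.random.randn(M, D + 1) * 0.01).astype(np.float32)   # (M, D+1): +1 column for the bias

# history = train_binary_classifier(
#     N=N, D=D, M=M, X=X, T=T, W=W,
#     log_loss_function=softmax_cross_entropy_log_loss,   # M >= 2 pairs with softmax
#     optimizer=optimizer,                                # an Optimizer instance from the project
#     num_epochs=100,
# )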
Example No. 18
    def loss(self, x, t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)

        return loss
Example No. 19
def test_010_softmax_cross_entropy_log_loss_2d(caplog):
    """
    Objective:
        Test case for softmax_cross_entropy_log_loss(X, T) = -T * log(softmax(X))

        For the input X of shape (N,M) and T in index format of shape (N,),
        calculate the softmax log loss and verify the values are as expected.

    Expected:
        For P = softmax(X) = exp(X) / sum(exp(X))
        _P = P[
          np.arange(N),
          T
        ] selects the probability p for the correct input x.
        Then -log(_P) should be almost the same as softmax_cross_entropy_log_loss(X, T).
        Almost, because finite float precision always introduces rounding errors.
    """
    # caplog.set_level(logging.DEBUG, logger=Logger.name)
    u = REFORMULA_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # [Test case 01]
    # N: Batch size, M: Number of features in X
    # X:(N,M)=(1, 2). X=(x0, x1) where x0 == x1 == 0.5, for which softmax(X) generates
    # equal probabilities P=(p0, p1) with p0 == p1 == 0.5.
    # Expected:
    #   softmax(X) is identical to X.
    #   softmax_cross_entropy_log_loss(X, T) == -log(0.5)
    # --------------------------------------------------------------------------------
    X = np.array([[0.5, 0.5]]).astype(TYPE_FLOAT)
    T = np.array([1]).astype(TYPE_LABEL)
    E = -logarithm(np.array([0.5]).astype(TYPE_FLOAT))

    P = softmax(X)
    assert np.array_equal(X, P)

    J, _ = softmax_cross_entropy_log_loss(X, T)
    assert (E.shape == J.shape)
    assert np.all(np.abs(E - J) < u), \
        "Expected abs(E-J) < %s but \n%s\nE=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
        % (u, np.abs(E - J), E, T, X, J)
    assert np.all(np.abs(P - _) < u)

    # --------------------------------------------------------------------------------
    # [Test case 02]
    # For X:(N,M)
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        # X(N, M), and T(N,) in index label format
        N = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M = np.random.randint(2, NUM_MAX_NODES)

        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.random.randint(0, M, N).astype(TYPE_LABEL)
        Logger.debug("T is %s\nX is \n%s\n", T, X)

        # ----------------------------------------------------------------------
        # Expected value E = -logarithm(_P)
        # ----------------------------------------------------------------------
        P = softmax(X)
        _P = P[np.arange(N), T]  # Probability p of the correct class, which generates j = -log(p)

        E = -logarithm(_P)

        # ----------------------------------------------------------------------
        # Actual J should be close to E.
        # ----------------------------------------------------------------------
        J, _ = softmax_cross_entropy_log_loss(X, T)
        assert (E.shape == J.shape)
        assert np.all(np.abs(E-J) < u), \
            "Expected abs(E-J) < %s but \n%s\nE=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(E - J), E, T, X, J)

        # ----------------------------------------------------------------------
        # L = cross_entropy_log_loss(P, T) should be close to J
        # ----------------------------------------------------------------------
        L = cross_entropy_log_loss(P, T)
        assert (L.shape == J.shape)
        assert np.all(np.abs(L-J) < u), \
            "Expected abs(L-J) < %s but \n%s\nL=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(L - J), L, T, X, J)
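Test case 01 above can be reproduced with nothing but NumPy, which also makes the softmax/selection mechanics from the docstring explicit. This is a sketch; the project's softmax_cross_entropy_log_loss additionally returns the activation P as its second value.

import numpy as np

X = np.array([[0.5, 0.5]], dtype=np.float32)
T = np.array([1])

P = np.exp(X) / np.exp(X).sum(axis=1, keepdims=True)  # row-wise softmax -> [[0.5, 0.5]]
J = -np.log(P[np.arange(len(T)), T])                  # pick p of the true class -> -log(0.5)

print(P, J)   # [[0.5 0.5]] [0.6931472]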