def test_010_softmax():
    """Test case for softmax"""
    u = ACTIVATION_DIFF_ACCEPTANCE_VALUE
    P = softmax(np.array([2.44756739, 2.13945115]).astype(TYPE_FLOAT))
    E = np.array([0.57642539, 0.42357461]).astype(TYPE_FLOAT)
    assert np.all(np.abs(P - E) < u)

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        X = MAX_ACTIVATION_VALUE * np.random.randn(N, M).astype(TYPE_FLOAT)
        assert np.all(np.isfinite(softmax(X)))
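# A minimal sketch of the softmax under test, assuming the conventional
# max-subtraction trick for numerical stability (the repository's actual
# softmax() may differ in dtype handling and edge cases).
import numpy as np

def softmax_sketch(X: np.ndarray) -> np.ndarray:
    """Numerically stable softmax along the last axis."""
    # Subtracting the row-wise max leaves the result unchanged but keeps
    # np.exp from overflowing for large activations, which is what the
    # np.isfinite() assertion above relies on.
    shifted = X - np.max(X, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)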
def predict(self, x):
    w1, w2 = self.params['w1'], self.params['w2']
    b1, b2 = self.params['b1'], self.params['b2']

    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    y = softmax(a2)
    return y
def predict(network, x):
    W1, W2, W3 = network['W1'], network['W2'], network['W3']
    b1, b2, b3 = network['b1'], network['b2'], network['b3']

    a1 = np.dot(x, W1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = softmax(a3)
    return y
def forward(self, x, t):
    self.t = t
    self.y = softmax(x)

    # Convert one-hot vector labels to index labels
    if self.t.size == self.y.size:
        self.t = self.t.argmax(axis=1)

    loss = cross_entropy_error(self.y, self.t)
    return loss
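# A minimal sketch of the assumed behavior of cross_entropy_error() used
# above: mean cross entropy for probabilities y:(N, M) and index labels
# t:(N,), with a small epsilon so log(0) cannot occur. The actual helper
# in this codebase may handle shapes and dtypes differently.
import numpy as np

def cross_entropy_error_sketch(y: np.ndarray, t: np.ndarray) -> float:
    if y.ndim == 1:                    # single sample -> batch of one
        y = y.reshape(1, y.size)
        t = np.atleast_1d(t)
    N = y.shape[0]
    # Pick the predicted probability of the correct class for each row.
    return float(-np.sum(np.log(y[np.arange(N), t] + 1e-7)) / N)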
def grad(self, x, t):
    w1, w2 = self.params['w1'], self.params['w2']
    b1, b2 = self.params['b1'], self.params['b2']
    grads = {}

    # forward
    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    a2 = np.dot(z1, w2) + b2
    y = softmax(a2)

    # backward
    dy = (y - t) / x.shape[0]    # gradient dL/da2 of softmax with cross entropy loss
    grads['w2'] = np.dot(z1.T, dy)
    grads['b2'] = np.sum(dy, axis=0)

    da1 = np.dot(dy, w2.T)
    dz1 = (1.0 - sigmoid(a1)) * sigmoid(a1) * da1    # sigmoid's gradient
    grads['w1'] = np.dot(x.T, dz1)
    grads['b1'] = np.sum(dz1, axis=0)
    return grads
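# A standalone sketch (not part of the network above) checking numerically
# that dL/da2 = (y - t) / N holds for softmax followed by cross entropy,
# which is the identity the `dy` line in grad() relies on.
import numpy as np

def check_softmax_ce_gradient(h: float = 1e-5) -> None:
    rng = np.random.default_rng(0)
    N, M = 4, 3
    a = rng.standard_normal((N, M))
    t = np.eye(M)[rng.integers(0, M, size=N)]    # one-hot labels (N, M)

    def loss(a_):
        e = np.exp(a_ - a_.max(axis=1, keepdims=True))
        y = e / e.sum(axis=1, keepdims=True)
        return -np.sum(t * np.log(y)) / N

    e = np.exp(a - a.max(axis=1, keepdims=True))
    y = e / e.sum(axis=1, keepdims=True)
    analytical = (y - t) / N

    numerical = np.zeros_like(a)
    for idx in np.ndindex(*a.shape):
        a[idx] += h
        fp = loss(a)
        a[idx] -= 2 * h
        fm = loss(a)
        a[idx] += h                          # restore
        numerical[idx] = (fp - fm) / (2 * h)

    assert np.allclose(analytical, numerical, atol=1e-6)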
def predict(self, X):
    """
    Responsibility:
        Generate a concrete prediction 0/1 for binary classification
        and an index for categorical classification.

        For binary classification where M=1, 1/True if Xi > 0 else 0.
        For categorical classification, argmax(X, axis=1) that identifies
        the class that gives the max probability.
    Args:
        X: scores
    Returns:
        Predictions
    """
    assert isinstance(X, np.ndarray) and X.dtype == TYPE_FLOAT, \
        f"Only np array of type {TYPE_FLOAT} is accepted"

    if X.ndim <= 1:
        X = np.array(X).reshape(1, -1)

    if self._log_loss_function == sigmoid_cross_entropy_log_loss:
        return (X > 0).astype(TYPE_LABEL)
    else:
        return np.argmax(softmax(X), axis=1).astype(TYPE_LABEL)
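# Illustrative usage of predict() above (values are hypothetical and the
# layer instance `clf` is assumed to be configured for each loss function):
#
#   clf.predict(np.array([[0.3], [-1.2]], dtype=TYPE_FLOAT))
#   # sigmoid_cross_entropy_log_loss (M=1) -> [[1], [0]]
#
#   clf.predict(np.array([[2.0, 0.5, 0.1]], dtype=TYPE_FLOAT))
#   # softmax_cross_entropy_log_loss -> [0], the index of the max probability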
def test_020_cross_entropy_log_loss_1d(caplog):
    """
    Objective:
        Test the categorical log loss values for P in 1 dimension.

    Constraints:
        1. The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h.
        2. The numerical gradient gn is within +/- u of the analytical g = -T/P.

    P: Probabilities from softmax of shape (M,)
    M: Number of nodes in the cross_entropy_log_loss layer.
    T: Labels

    Note:
        log(P=1) -> 0
        dlog(x)/dx = 1/x
    """
    def f(P: np.ndarray, T: np.ndarray):
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = True/1, OHE label T[index] = 1 where
    # P=[0,0,0,...,1,...0], T = [0,0,0,...1,...0]. T[i] == 1
    #
    # Do not forget the Jacobian shape is (N,) and calculate each element.
    # 1. For T=1, loss L = -log(Pi) = 0 and dL/dP = -1/Pi = -1 is expected.
    # 2. For T=0, loss L = (-log(0+offset+h)-log(0+offset-h)) / 2h = 0 is expected.
    # --------------------------------------------------------------------------------
    M: TYPE_INT = np.random.randint(2, NUM_MAX_NODES)
    index: TYPE_INT = TYPE_INT(np.random.randint(0, M))    # Position of the true label in P
    P1 = np.zeros(M, dtype=TYPE_FLOAT)
    P1[index] = TYPE_FLOAT(1.0)
    T1 = np.zeros(M, dtype=TYPE_LABEL)
    T1[index] = TYPE_LABEL(1)

    # Analytically correct gradient for P=1, T=1
    AG = np.zeros_like(P1, dtype=TYPE_FLOAT)
    AG[index] = TYPE_FLOAT(-1)    # dL/dP = -1

    EGN1 = np.zeros_like(P1, dtype=TYPE_FLOAT)    # Expected numerical gradient
    EGN1[index] = (-1 * logarithm(TYPE_FLOAT(1.0 + h)) +
                   TYPE_FLOAT(1) * logarithm(TYPE_FLOAT(1.0 - h))) / TYPE_FLOAT(2 * h)
    assert np.all(np.abs(EGN1-AG) < u), \
        "Expected EGN-1<%s but %s\nEGN=\n%s" % (u, (EGN1-AG), EGN1)

    GN1 = numerical_jacobian(partial(f, T=T1), P1)
    assert np.all(np.abs(GN1-AG) < u), \
        "Expected GN-1<%s but %s\nGN=\n%s" % (u, (GN1-AG), GN1)

    # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
    assert GN1.shape == EGN1.shape
    assert np.all(np.abs(EGN1-GN1) < u), \
        "Expected GN1==EGN1 but GN1-EGN1=\n%sP=\n%s\nT=%s\nEGN=\n%s\nGN=\n%s\n" \
        % (np.abs(GN1-EGN1), P1, T1, EGN1, GN1)

    # The numerical gradient gn is within +/- u of the analytical g = -T/P
    G1 = np.zeros_like(P1, dtype=TYPE_FLOAT)
    G1[T1 == 1] = -1 * (T1[index] / P1[index])    # G1[T1 == 0] remains 0
    check.equal(np.all(np.abs(G1 - GN1) < u), True, "G1-GN1 %s\n" % np.abs(G1 - GN1))

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = np uniform(), index label T=index
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        M = np.random.randint(2, NUM_MAX_NODES)    # M > 1
        T2 = TYPE_LABEL(np.random.randint(0, M))   # location of the truth
        P2 = np.zeros(M, dtype=TYPE_FLOAT)
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P2[T2] = p

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # G:[0, 0, ..., g, 0, ...] where g is the numerical gradient close to -1/p.
        # --------------------------------------------------------------------------------
        EGN2 = np.zeros_like(P2, dtype=TYPE_FLOAT)    # Expected numerical gradient
        EGN2[T2] = TYPE_FLOAT(-1) * (logarithm(p + h) - logarithm(p - h)) / TYPE_FLOAT(2 * h)
        GN2 = numerical_jacobian(partial(f, T=T2), P2)

        # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
        assert GN2.shape == EGN2.shape
        assert np.all(np.abs(GN2-EGN2) < u), \
            f"Delta expected to be < {u} but \n{np.abs(GN2-EGN2)}"

        G2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        G2[T2] = -1 / p

        # The numerical gradient gn is within +/- u of the analytical g = -T/P
        check.equal(np.all(np.abs(G2 - GN2) < u), True, "G2-GN2 %s\n" % np.abs(G2 - GN2))
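# A minimal sketch of the numerical_jacobian() contract assumed by these
# tests: a central-difference approximation of df/dX evaluated element by
# element. The repository version adds dtype and offset handling (e.g.
# OFFSET_DELTA) not reproduced here.
from typing import Callable
import numpy as np

def numerical_jacobian_sketch(f: Callable[[np.ndarray], float],
                              X: np.ndarray,
                              h: float = 1e-5) -> np.ndarray:
    J = np.zeros_like(X)
    for idx in np.ndindex(*X.shape):
        original = X[idx]
        X[idx] = original + h
        fp = f(X)                        # f(X + h)
        X[idx] = original - h
        fm = f(X)                        # f(X - h)
        X[idx] = original                # restore X
        J[idx] = (fp - fm) / (2 * h)     # central difference
    return J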
def test_020_cross_entropy_log_loss_2d(caplog):
    """
    Objective:
        Test case for cross_entropy_log_loss(X, T) for X:shape(N,M), T:shape(N,)
    Expected:
    """
    def f(P: np.ndarray, T: np.ndarray):
        """Loss function"""
        # For P.ndim==2 of shape (N, M), cross_entropy_log_loss() returns (N,),
        # each element of which is the loss for P[n].
        # If divided by P.shape[0] or N, the loss gets scaled by 1/N, which is wrong.
        # This is not a gradient function but a loss function.
        # return np.sum(cross_entropy_log_loss(P, T)) / P.shape[0]
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    for _ in range(NUM_MAX_TEST_TIMES):
        # --------------------------------------------------------------------------------
        # [2D test case]
        # P:(N, M) is a probability matrix where P[n][m] = p, 0 <= n < N, 0 <= m < M.
        # T:(N,) is an index label where Tn = m is the label as an integer,
        # e.g. m=3 for the 3rd label.
        # L = -log(p) -> dL/dP = -1/p
        #
        # Keep the p value away from 0. As p gets close to 0, log(p+/-h) gets large,
        # e.g. -11.512925464970229, hence log(p+/-h) / 2h explodes.
        # --------------------------------------------------------------------------------
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        N = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M = np.random.randint(2, NUM_MAX_NODES)

        # label index, not OHE
        T = np.random.randint(0, M, N).astype(TYPE_LABEL)    # N rows of labels; max label value is M-1
        P = np.zeros((N, M)).astype(TYPE_FLOAT)
        P[range(N), T] = p    # Set p at the label position of each row

        E = np.zeros_like(P).astype(TYPE_FLOAT)
        E[range(N), T] = (TYPE_FLOAT(-1) * logarithm(p + h) +
                          TYPE_FLOAT(1) * logarithm(p - h)) / (TYPE_FLOAT(2) * h)

        G = numerical_jacobian(partial(f, T=T), P)
        assert E.shape == G.shape, \
            f"Jacobian shape is expected to be {E.shape} but {G.shape}."
        assert np.all(np.abs(E-G) < u), \
            f"Delta expected to be < {u} but \n{np.abs(E-G)}"

        A = np.zeros_like(P).astype(TYPE_FLOAT)
        A[range(N), T] = -1 / p    # Analytical gradient -1/p at the label position

        check.equal(np.all(np.abs(A - G) < u), True, "A-G %s\n" % np.abs(A - G))
    for _ in range(NUM_MAX_TEST_TIMES):
        # --------------------------------------------------------------------------------
        # [1D test case]
        # P:[0, 0, ..., 1, 0, ...] where Pi = 1
        # T:[0, 0, ..., 1, 0, ...] is OHE label where Ti = 1
        # sum(-t * log(p+k)) -> log(1+k)
        # dlog(P+k)/dP -> -1 / (1+k)
        # --------------------------------------------------------------------------------
        M = np.random.randint(2, NUM_MAX_NODES)    # M > 1
        index = np.random.randint(0, M)            # location of the truth
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P3 = np.zeros(M, dtype=TYPE_FLOAT)
        P3[index] = p
        T3 = np.zeros(M).astype(TYPE_LABEL)    # OHE label
        T3[index] = TYPE_LABEL(1)

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # --------------------------------------------------------------------------------
        EGN3 = np.zeros_like(P3, dtype=TYPE_FLOAT)    # Expected numerical gradient
        EGN3[index] = TYPE_FLOAT(-1 * logarithm(p + h) + 1 * logarithm(p - h)) / TYPE_FLOAT(2 * h)
        GN3 = numerical_jacobian(partial(f, T=T3), P3)

        assert GN3.shape == EGN3.shape
        assert np.all(np.abs(GN3-EGN3) < u), \
            f"Delta expected to be < {u} but \n{np.abs(GN3-EGN3)}"
def forward(self, x, t):
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss
def forward(self, x, **kwargs):
    self.y = kwargs['y']         # label
    self.y_hat = softmax(x)      # prediction
    self.loss = cross_entropy_error(self.y_hat, self.y)
    return self.loss
def disabled_test_040_softmax_log_loss_2d(caplog):
    """
    TODO: Disabled as numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(softmax_cross_entropy_log_loss) / N.
        2. gradient_numerical() == numerical_jacobian(objective, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_softmax_log_loss_2d_ohe"

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)    # number of nodes > 1
        _layer = layer.CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=softmax_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)    # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = int(1)

        # log_loss function requires (X, T) with X:(N, M) and T:(N,) in index label format.
        X, T = transform_X_T(X, T)
        _layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        # --------------------------------------------------------------------------------
        # Expected analytical gradient EG = (dL/dX) = (A-T)/N
        # --------------------------------------------------------------------------------
        A = softmax(X)
        EG = np.copy(A)
        EG[np.arange(N), T] -= TYPE_FLOAT(1)    # Shape (N,); subtract from elements for T=1 only
        EG /= TYPE_FLOAT(N)

        # --------------------------------------------------------------------------------
        # Total loss Z = np.sum(J)/N
        # Expected loss EL = -sum(T*log(_A))
        # (J, P) = softmax_cross_entropy_log_loss(X, T) where J:shape(N,) is the loss
        # for each input and P is the activation softmax(X).
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        J, P = softmax_cross_entropy_log_loss(X, T)
        EL = np.array(-np.sum(logarithm(A[np.arange(N), T])) / N, dtype=TYPE_FLOAT)

        # Constraint: A == P as they are both softmax(X)
        assert np.all(np.abs(A-P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \
            f"Need A==P==softmax(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n"

        # Constraint: Log loss layer output L == sum(J)/N from the log loss function
        Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT)
        assert np.array_equal(L, Z), \
            f"Need log loss layer output L == sum(J)/N but L=\n{L}\nZ=\n{Z}."

        # Constraint: L/loss is close to the expected loss EL.
        assert np.all(np.abs(EL-L) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            f"Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n"

        # Constraint: gradient_numerical() == numerical_jacobian(objective, X)
        # TODO: compare the diff to accommodate numerical errors.
        GN = _layer.gradient_numerical()    # [dL/dX] from the layer

        def objective(x):
            """Function to calculate the scalar loss L for cross entropy log loss"""
            j, p = softmax_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X)    # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            "GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # Constraint: Analytical gradient G: gradient() == EG == (P-1)/N.
        dY = TYPE_FLOAT(1)
        G = _layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # Constraint: Analytical gradient G is close to GN: gradient_numerical().
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"

        # Constraint: Gradient g of the log loss layer needs -1 < g < 1
        # because abs(P-T) = abs(softmax(X)-T) cannot be > 1.
        assert np.all(np.abs(G) < 1), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}"
        assert np.all(np.abs(GN[0]) < (1+GRADIENT_DIFF_ACCEPTANCE_RATIO)), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}"

    profiler.disable()
    profiler.print_stats(sort="cumtime")
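# A sketch of why the docstring's TODO mentions redesigning numerical_jacobian
# for 32-bit floats: with float32, a central difference with too small an h is
# dominated by rounding error. Illustration with the known derivative
# d(x^2)/dx = 2x at x = 1:
import numpy as np

x = np.float32(1.0)
for h in (np.float32(1e-2), np.float32(1e-4), np.float32(1e-6)):
    approx = ((x + h) ** 2 - (x - h) ** 2) / (2 * h)
    print(h, approx)    # drifts away from 2.0 as h approaches float32 epsilon (~1.19e-7)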
def forward(self, x):
    self.out = softmax(x)
    return self.out
def validate_relu_neuron_training(matmul: Matmul,
                                  activation: ReLU,
                                  loss: CrossEntropyLogLoss,
                                  X: np.ndarray,
                                  T: np.ndarray,
                                  num_epochs: int = 100,
                                  test_numerical_gradient: bool = False,
                                  callback: Callable = None):
    activation.objective = loss.function
    matmul.objective = compose(activation.function, loss.function)
    objective = compose(matmul.function, matmul.objective)

    num_no_progress: int = 0    # number of times the loss L has not decreased
    history: List[np.ndarray] = []

    loss.T = T
    for i in range(num_epochs):
        L = objective(X)
        N = X.shape[0]
        P = softmax(relu(np.matmul(matmul.X, matmul.W.T)))
        EDA = expected_gradient_from_log_loss(P=P, T=T, N=N)

        # ********************************************************************************
        # Constraint: Expected gradients must match actual
        # ********************************************************************************
        validate_relu_neuron_round_trip(matmul=matmul,
                                        activation=activation,
                                        X=X,
                                        dA=EDA)

        # --------------------------------------------------------------------------------
        # gradient descent and get the analytical dL/dX, dL/dW
        # --------------------------------------------------------------------------------
        previous_W = copy.deepcopy(matmul.W)
        matmul.update()    # dL/dX, dL/dW

        # ********************************************************************************
        # Constraint: W in the matmul has been updated by the gradient descent.
        # ********************************************************************************
        Logger.debug("W after is \n%s", matmul.W)
        if np.array_equal(previous_W, matmul.W):
            Logger.warning("W has not been updated")

        # ********************************************************************************
        # Constraint: Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # ********************************************************************************
        if i > 0 and L >= history[-1]:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s] for %s times.",
                i, L, history[-1], num_no_progress + 1)
            # --------------------------------------------------------------------------------
            # Reducing the learning rate can make the situation worse.
            # When the lr was reduced every time L >= history[-1], the non-improvements
            # became successive and eventually exceeded 50 in a row, ending in failure.
            # Keeping the learning rate made L >= history[-1] more frequent, but with at
            # most 3 successive events, and the training still kept progressing.
            # --------------------------------------------------------------------------------
            num_no_progress += 1
            if num_no_progress > 5:
                matmul.lr = matmul.lr * 0.95

            if num_no_progress > 50:
                Logger.error(
                    "The training has no progress more than %s times.", num_no_progress)
                break
        else:
            num_no_progress = 0

        history.append(L)

        if callback:
            callback(W=matmul.W)

    return history
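# A minimal sketch of the compose() helper assumed above, i.e. left-to-right
# function composition so that compose(f, g)(x) == g(f(x)). This matches how
# objective = compose(matmul.function, matmul.objective) chains
# matmul -> activation -> loss; the repository version may differ in details.
from typing import Callable

def compose_sketch(*functions: Callable) -> Callable:
    def composed(x):
        for fn in functions:
            x = fn(x)    # feed each output into the next function
        return x
    return composed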
def test_030_objective_methods_1d_ohe():
    """
    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().

    Expected:
        Initialization detects access to non-initialized parameters and fails.

        For X.ndim > 0, the layer transforms X into 2D so as to use the numpy
        tuple-like indexing: P[(0, 3), (2, 4)]. Hence the shapes of GN and G are 2D.
    """
    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_030_objective_methods_1d_ohe"
    N = 1

    for _ in range(NUM_MAX_TEST_TIMES):
        M: int = np.random.randint(2, NUM_MAX_NODES)
        assert M >= 2, "Softmax is for multi-class classification. " \
                       "Use Sigmoid for binary classification."
        _layer = layer.CrossEntropyLogLoss(name=name,
                                           num_nodes=M,
                                           log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)    # OHE labels.
        T[np.random.randint(0, M)] = TYPE_LABEL(1)
        _layer.T = T

        P = softmax(X)
        EG = ((P - T) / N).reshape(1, -1).astype(TYPE_FLOAT)    # Expected analytical gradient dL/dX = (P-T)/N
        Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T, P, EG)

        # --------------------------------------------------------------------------------
        # Constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T)),
                     dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        assert np.array_equal(L, Z), f"SoftmaxLogLoss output should be {L} but {Z}."

        # --------------------------------------------------------------------------------
        # Constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).
        # Use a dummy objective function instead of the "_layer" itself, because using
        # "_layer" updates its X, Y states, which can interfere with the independence
        # of the _layer.
        # --------------------------------------------------------------------------------
        GN = _layer.gradient_numerical()    # [dL/dX] from the _layer

        # --------------------------------------------------------------------------------
        # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L,
        # because it would apply transform_X_T multiple times.
        # Internally, transform_X_T(X, T) has already transformed T into an index label
        # in 1D with length 1 by "T = T.reshape(-1)".
        # Providing X in 1D to "dummy.function(x)" would re-run "transform_X_T(X, T)".
        # With (X.ndim == T.ndim == 1) as input, T must be an OHE label for that
        # combination, and T.shape == P.shape must hold for OHE labels.
        # However, T has already been converted into the index format by transform_X_T
        # (applying transform_X_T multiple times), giving (T.shape=(1,1), X.shape=(1, >1)),
        # which violates the (X.shape == T.shape) constraint.
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name="dummy",
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # dummy.function(X)
        # O = lambda x: dummy.objective(dummy.function(x))    # Objective function
        # --------------------------------------------------------------------------------
        O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T),
                             dtype=TYPE_FLOAT) / TYPE_FLOAT(N)

        EGN = numerical_jacobian(O, X).reshape(1, -1)    # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}."

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # Constraint: Analytical gradient G: gradient() == (P-1)/N.
        # --------------------------------------------------------------------------------
        dY = TYPE_FLOAT(1)
        G = _layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG} but G-EG \n{np.abs(G-EG)}\n"

        # --------------------------------------------------------------------------------
        # Constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
def disabled_test_030_objective_methods_2d_ohe():
    """
    TODO: Disabled as numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-1)/N
        2. Analytical gradient G is close to GN: gradient_numerical().

    Expected:
        Initialization detects access to non-initialized parameters and fails.
    """
    def objective(X: np.ndarray) -> Union[float, np.ndarray]:
        """Dummy objective function to calculate the loss L"""
        assert X.ndim == 0, "The output of the log loss should be of shape ()"
        return X

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_030_objective_methods_2d_ohe"

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        assert M >= 2, "Softmax is for multi-class classification. " \
                       "Use Sigmoid for binary classification."
        _layer = layer.CrossEntropyLogLoss(name=name,
                                           num_nodes=M,
                                           log_level=logging.DEBUG)
        _layer.objective = objective

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)    # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1)
        _layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        P = softmax(X)
        EG = (P - T) / N    # Expected analytical gradient dL/dX = (P-T)/N

        # --------------------------------------------------------------------------------
        # Constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N.
        # --------------------------------------------------------------------------------
        L = _layer.function(X)
        Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T))) / N
        assert np.array_equal(L, Z), f"SoftmaxLogLoss output should be {L} but {Z}."

        # --------------------------------------------------------------------------------
        # Constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X)
        # --------------------------------------------------------------------------------
        GN = _layer.gradient_numerical()    # [dL/dX] from the _layer

        # --------------------------------------------------------------------------------
        # Do not use CrossEntropyLogLoss.function() to simulate the objective function for
        # the expected GN. See the same part in test_030_objective_methods_1d_ohe().
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name=name,
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # O = lambda x: dummy.objective(dummy.function(x))    # Objective function
        O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T)) / N
        # --------------------------------------------------------------------------------
        EGN = numerical_jacobian(O, X)    # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            "GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # Constraint: Analytical gradient G: gradient() == (P-1)/N.
        # --------------------------------------------------------------------------------
        dY = float(1)
        G = _layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # --------------------------------------------------------------------------------
        # Constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
def train_binary_classifier(N: int,
                            D: int,
                            M: int,
                            X: np.ndarray,
                            T: np.ndarray,
                            W: np.ndarray,
                            log_loss_function: Callable,
                            optimizer: Optimizer,
                            num_epochs: int = 100,
                            test_numerical_gradient: bool = False,
                            log_level: int = logging.ERROR,
                            callback: Callable = None):
    """Test case for binary classification with matmul + log loss.
    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and 2 for softmax
        X: train data
        T: labels
        W: weight
        log_loss_function: cross entropy log loss function
        optimizer: Optimizer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag to test the analytical gradient against the numerical one.
        log_level: logging level
        callback: callback function to invoke at each epoch end.
    """
    name = __name__
    assert isinstance(T, np.ndarray) and np.issubdtype(T.dtype, np.integer) \
        and T.ndim == 1 and T.shape[0] == N
    assert isinstance(X, np.ndarray) and X.dtype == TYPE_FLOAT \
        and X.ndim == 2 and X.shape[0] == N and X.shape[1] == D
    assert isinstance(W, np.ndarray) and W.dtype == TYPE_FLOAT \
        and W.ndim == 2 and W.shape[0] == M and W.shape[1] == D + 1
    assert num_epochs > 0 and N > 0 and D > 0
    assert (
        (log_loss_function == sigmoid_cross_entropy_log_loss and M == 1) or
        (log_loss_function == softmax_cross_entropy_log_loss and M >= 2)
    )

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(name="loss",
                               num_nodes=M,
                               log_loss_function=log_loss_function,
                               log_level=log_level)

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul = Matmul(name="matmul",
                    num_nodes=M,
                    W=W,
                    optimizer=optimizer,
                    log_level=log_level)
    matmul.objective = loss.function

    num_no_progress: int = 0    # number of times the loss L has not decreased
    loss.T = T
    history: List[np.ndarray] = [loss.function(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # Calculate the matmul output Y=f(X) and get the loss L = objective(Y).
        # Test the numerical gradient dL/dX=matmul.gradient_numerical().
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        L = loss.function(Y)

        if not (i % 50):
            print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # --------------------------------------------------------------------------------
        # Constraint: 1. Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # --------------------------------------------------------------------------------
        if L >= history[-1] and (i % 20) == 1:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s].",
                i, L, history[-1])
            if (num_no_progress := num_no_progress + 1) > 20:
                Logger.error(
                    "The training has no progress more than %s times.", num_no_progress)
                # break
        else:
            num_no_progress = 0

        history.append(L)

        # --------------------------------------------------------------------------------
        # Expected dL/dW.T = X.T @ dL/dY = X.T @ (P-T) / N, and dL/dX = dL/dY @ W
        # P = sigmoid(X) or softmax(X)
        # dL/dX = dL/dY @ W uses W BEFORE updating W.
        # --------------------------------------------------------------------------------
        P = None
        if log_loss_function == sigmoid_cross_entropy_log_loss:
            # P = sigmoid(np.matmul(X, W.T))
            P = sigmoid(np.matmul(matmul.X, matmul.W.T))
            P = P - T.reshape(-1, 1)    # T:(N,) -> T:(N,1) to align with P:(N,1)
            assert P.shape == (N, 1), "P.shape is %s T.shape is %s" % (P.shape, T.shape)

        elif log_loss_function == softmax_cross_entropy_log_loss:
            # matmul.X.shape is (N, D+1), matmul.W.T.shape is (D+1, M)
            P = softmax(np.matmul(matmul.X, matmul.W.T))    # (N, M)
            P[np.arange(N), T] -= 1

        EDX = np.matmul(P / N, matmul.W)        # (N, M) @ (M, D+1) -> (N, D+1)
        EDX = EDX[:, 1:]                        # Drop the bias column -> (N, D)
        EDW = np.matmul(matmul.X.T, P / N).T    # ((D+1, N) @ (N, M)).T -> (M, D+1)

        # --------------------------------------------------------------------------------
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update Wn+1 = Wn - lr * dL/dW.
        # --------------------------------------------------------------------------------
        before = copy.deepcopy(matmul.W)
        dY = loss.gradient(TYPE_FLOAT(1))
        dX = matmul.gradient(dY)

        # gradient descent and get the analytical gradients dS=[dL/dX, dL/dW]
        # dL/dX.shape = (N, D)
        # dL/dW.shape = (M, D+1)
        dS = matmul.update()
        dW = dS[0]

        # --------------------------------------------------------------------------------
        # Constraint 1. W in the matmul has been updated by the gradient descent.
        # --------------------------------------------------------------------------------
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        if not validate_against_expected_gradient(EDX, dX):
            Logger.warning("Expected dL/dX \n%s\nDiff\n%s", EDX, EDX - dX)
        if not validate_against_expected_gradient(EDW, dW):
            Logger.warning("Expected dL/dW \n%s\nDiff\n%s", EDW, EDW - dW)

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradients gn=[dL/dX, dL/dW]
            # dL/dX.shape = (N, D)
            # dL/dW.shape = (M, D+1)
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W[0])
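# A standalone sketch (all names local to this snippet; the bias column used
# above is omitted for brevity) checking the expected-gradient formulas for
# the softmax branch numerically: with Y = X @ W.T, P = softmax(Y), and index
# labels T, the formulas are dL/dW = ((P - T_ohe)/N).T @ X and
# dL/dX = ((P - T_ohe)/N) @ W. Only dL/dW is checked; dL/dX follows the
# same pattern.
import numpy as np

def check_matmul_softmax_gradients(h: float = 1e-5) -> None:
    rng = np.random.default_rng(0)
    N, D, M = 4, 3, 5
    X = rng.standard_normal((N, D))
    W = rng.standard_normal((M, D))
    T = rng.integers(0, M, size=N)

    def loss(X_, W_):
        Y = X_ @ W_.T
        e = np.exp(Y - Y.max(axis=1, keepdims=True))
        P_ = e / e.sum(axis=1, keepdims=True)
        return -np.sum(np.log(P_[np.arange(N), T])) / N

    # Analytical gradient from the formula above.
    Y = X @ W.T
    e = np.exp(Y - Y.max(axis=1, keepdims=True))
    P = e / e.sum(axis=1, keepdims=True)
    P[np.arange(N), T] -= 1           # P - T_ohe
    EDW = (P / N).T @ X               # (M, D)

    # Numerical gradient for dL/dW via central differences.
    NDW = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        w0 = W[idx]
        W[idx] = w0 + h
        fp = loss(X, W)
        W[idx] = w0 - h
        fm = loss(X, W)
        W[idx] = w0                   # restore
        NDW[idx] = (fp - fm) / (2 * h)

    assert np.allclose(EDW, NDW, atol=1e-6)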
def loss(self, x, t):
    z = self.predict(x)
    y = softmax(z)
    loss = cross_entropy_error(y, t)
    return loss
def test_010_softmax_cross_entropy_log_loss_2d(caplog):
    """
    Objective:
        Test case for softmax_cross_entropy_log_loss(X, T) = -T * log(softmax(X))

        For the input X of shape (N,M) and T in index format of shape (N,),
        calculate the softmax log loss and verify the values are as expected.

    Expected:
        For P = softmax(X) = exp(X) / sum(exp(X)):
        _P = P[np.arange(N), T] selects the probability p for the correct input x.
        Then -log(_P) should be almost the same as softmax_cross_entropy_log_loss(X, T).
        Almost, because finite float precision always introduces rounding errors.
    """
    # caplog.set_level(logging.DEBUG, logger=Logger.name)
    u = REFORMULA_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # [Test case 01]
    # N: Batch size, M: Number of features in X
    # X:(N,M)=(1, 2). X=(x0, x1) where x0 == x1 == 0.5, for which softmax(X)
    # generates equal probabilities P=(p0, p1) where p0 == p1.
    # Expected:
    #   softmax(X) generates the same values as X.
    #   softmax_cross_entropy_log_loss(X, T) == -log(0.5)
    # --------------------------------------------------------------------------------
    X = np.array([[0.5, 0.5]]).astype(TYPE_FLOAT)
    T = np.array([1]).astype(TYPE_LABEL)
    E = -logarithm(np.array([0.5]).astype(TYPE_FLOAT))

    P = softmax(X)
    assert np.array_equal(X, P)

    J, _ = softmax_cross_entropy_log_loss(X, T)
    assert (E.shape == J.shape)
    assert np.all(np.abs(E - J) < u), \
        "Expected abs(E-J) < %s but \n%s\nE=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
        % (u, np.abs(E - J), E, T, X, J)
    assert np.all(np.abs(P - _) < u)

    # --------------------------------------------------------------------------------
    # [Test case 02]
    # For X:(N,M)
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        # X:(N, M) and T:(N,) in index label format
        N = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M = np.random.randint(2, NUM_MAX_NODES)
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.random.randint(0, M, N).astype(TYPE_LABEL)
        Logger.debug("T is %s\nX is \n%s\n", T, X)

        # ----------------------------------------------------------------------
        # Expected value E = -logarithm(_P)
        # ----------------------------------------------------------------------
        P = softmax(X)
        _P = P[np.arange(N), T]    # Probability p for the correct input x, which generates j=-log(p)
        E = -logarithm(_P)

        # ----------------------------------------------------------------------
        # Actual J should be close to E.
        # ----------------------------------------------------------------------
        J, _ = softmax_cross_entropy_log_loss(X, T)
        assert (E.shape == J.shape)
        assert np.all(np.abs(E-J) < u), \
            "Expected abs(E-J) < %s but \n%s\nE=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(E - J), E, T, X, J)

        # ----------------------------------------------------------------------
        # L = cross_entropy_log_loss(P, T) should be close to J
        # ----------------------------------------------------------------------
        L = cross_entropy_log_loss(P, T)
        assert (L.shape == J.shape)
        assert np.all(np.abs(L-J) < u), \
            "Expected abs(L-J) < %s but \n%s\nL=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(L - J), L, T, X, J)
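# A minimal sketch of the contract softmax_cross_entropy_log_loss() is
# assumed to satisfy in the test above: given scores X:(N, M) and index
# labels T:(N,), return per-sample losses J:(N,) and the activations
# P = softmax(X). The repository version fuses the two computations for
# numerical stability; this naive form is for illustration only.
import numpy as np

def softmax_cross_entropy_log_loss_sketch(X: np.ndarray, T: np.ndarray):
    shifted = X - X.max(axis=1, keepdims=True)    # stable softmax
    exp = np.exp(shifted)
    P = exp / exp.sum(axis=1, keepdims=True)
    N = X.shape[0]
    J = -np.log(P[np.arange(N), T])               # loss per sample, shape (N,)
    return J, P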