def disabled_test_040_objective_methods_2d_ohe(caplog):
    """
    TODO: Disabled as numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(sigmoid_cross_entropy_log_loss) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_2d_ohe"

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = 1      # node number is 1 for 0/1 binary classification.
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)      # OHE labels.
        T[
            np.arange(N),
            np.random.randint(0, M, N)
        ] = TYPE_LABEL(1)

        # The log_loss function requires (X, T) in X(N, M) and T(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        # --------------------------------------------------------------------------------
        # Expected analytical gradient EG = (dL/dX) = (A-T)/N
        # --------------------------------------------------------------------------------
        A = sigmoid(X)
        EG = ((A - T).astype(TYPE_FLOAT) / TYPE_FLOAT(N))

        # --------------------------------------------------------------------------------
        # Total loss Z = np.sum(J) / N
        # Expected loss EL = sum((1-T)X + np.log(1 + np.exp(-X))) / N
        # (J, P) = sigmoid_cross_entropy_log_loss(X, T) where J:shape(N,) is the loss
        # for each input and P is the activation sigmoid(X).
        # --------------------------------------------------------------------------------
        L = layer.function(X)
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        EL = np.array(
            np.sum((1 - T) * X + logarithm(1 + np.exp(-X))) / N,
            dtype=TYPE_FLOAT
        )

        # Constraint: A == P as they are both sigmoid(X)
        assert np.all(np.abs(A - P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \
            f"Need A==P==sigmoid(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n"

        # Constraint: Log loss layer output L == sum(J) / N from the log loss function
        Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT)
        assert np.array_equal(L, Z), \
            f"Need log loss layer output L == sum(J)/N but L=\n{L}\nZ=\n{Z}."

        # Constraint: L/loss is close to expected loss EL.
        assert np.all(np.abs(EL - L) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            f"Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n"

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical_jacobian(objective, X)
        # TODO: compare the diff to accommodate numerical errors.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()     # [dL/dX] from the layer

        def objective(x):
            """Function to calculate the scalar loss L for cross entropy log loss"""
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X)      # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            "GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G - EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        assert \
            np.allclose(
                GN[0], G,
                atol=GRADIENT_DIFF_ACCEPTANCE_VALUE,
                rtol=GRADIENT_DIFF_ACCEPTANCE_RATIO
            ), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRDiff is \n{G - GN[0]}.\n"

        # constraint: Gradient g of the log loss layer needs -1 < g < 1
        # because abs(P-T) = abs(sigmoid(X)-T) cannot exceed 1.
        assert np.all(np.abs(G) < 1), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}"
        assert np.all(np.abs(GN[0]) < (1 + GRADIENT_DIFF_ACCEPTANCE_RATIO)), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}"

    profiler.disable()
    profiler.print_stats(sort="cumtime")
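
# --------------------------------------------------------------------------------
# Illustration only (not part of the test suite): a minimal, self-contained sketch
# of the numerical check above, done in float64 to sidestep the 32-bit precision
# issue noted in the TODO. All helper names here (_sigmoid, _logistic_loss,
# _central_diff, _sketch_numerical_gradient_float64) are hypothetical and exist
# only for this sketch. In float64, a central difference with h=1e-5 matches the
# analytical gradient (sigmoid(X)-T)/N closely.
# --------------------------------------------------------------------------------
def _sketch_numerical_gradient_float64():
    import numpy as np

    def _sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def _logistic_loss(x, t):
        # Per-batch mean of J = (1-T)X + log(1+exp(-X)), matching EL above.
        return np.sum((1 - t) * x + np.log1p(np.exp(-x))) / x.shape[0]

    def _central_diff(f, x, h=1e-5):
        # Numerical gradient via central difference (f(x+h) - f(x-h)) / 2h.
        g = np.zeros_like(x)
        for idx in np.ndindex(x.shape):
            tmp = x[idx]
            x[idx] = tmp + h
            fp = f(x)
            x[idx] = tmp - h
            fm = f(x)
            x[idx] = tmp
            g[idx] = (fp - fm) / (2.0 * h)
        return g

    X = np.random.randn(4, 1)                                   # float64 by default
    T = np.random.randint(0, 2, size=(4, 1)).astype(np.float64)
    analytical = (_sigmoid(X) - T) / X.shape[0]
    numerical = _central_diff(lambda x: _logistic_loss(x, T), X.copy())
    assert np.allclose(analytical, numerical, atol=1e-7)
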
def disabled_test_040_objective_methods_1d_ohe():
    """
    TODO: Disabled as numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is
           np.sum(cross_entropy_log_loss(sigmoid(X), T, f=logistic_log_loss)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().

    Expected:
        Initialization detects access to non-initialized parameters and fails.

        For X.ndim > 0, the layer transforms X into 2D so as to use the numpy
        tuple-like indexing: P[(0,3), (2,4)]. Hence the shapes of GN and G are 2D.
    """
    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_1d_ohe"
    N = 1

    for _ in range(NUM_MAX_TEST_TIMES):
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = TYPE_FLOAT(
            np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID)
        )
        T = TYPE_LABEL(np.random.randint(0, 2))     # OHE labels.

        # The log_loss function requires (X, T) in X(N, M) and T(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T

        # Expected analytical gradient dL/dX = (P-T)/N of shape (N,M)
        A = sigmoid(X)
        EG = ((A - T) / N).reshape(1, -1).astype(TYPE_FLOAT)

        Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T, A, EG)

        # --------------------------------------------------------------------------------
        # constraint: L/loss == np.sum(J) / N.
        # J, P = sigmoid_cross_entropy_log_loss(X, T)
        # --------------------------------------------------------------------------------
        L = layer.function(X)   # L is shape ()
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        Z = np.array(np.sum(J), dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        assert np.array_equal(L, Z), f"LogLoss output L should equal Z but L={L}, Z={Z}."

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).
        # Use a dummy layer for the objective function, because using "layer" itself
        # updates its X, Y and interferes with the independence of the layer under test.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()     # [dL/dX] from the layer

        # --------------------------------------------------------------------------------
        # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L,
        # because doing so applies transform_X_T multiple times:
        # transform_X_T(X, T) has already converted T into an index label in 1D with
        # length 1 via "T = T.reshape(-1)". Providing X in 1D to "dummy.function(x)"
        # re-runs "transform_X_T(X, T)". With (X.ndim == T.ndim == 1) as input, T must
        # be an OHE label and T.shape == P.shape must hold. However, T has already been
        # converted to the index format (transform_X_T applied twice), yielding
        # (T.shape=(1,1), X.shape=(1, > 1)), which violates the (X.shape == T.shape)
        # constraint.
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name="dummy",
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # dummy.function(X)
        # --------------------------------------------------------------------------------
        def objective(x):
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X).reshape(1, -1)   # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}."

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        # --------------------------------------------------------------------------------
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G - EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            "dX is \n%s\nGN is \n%s\nG-GN is \n%s\n Ratio * GN[0] is \n%s.\n" \
            % (G, GN[0], G - GN[0], GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])
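
# --------------------------------------------------------------------------------
# Illustration only (not part of the test suite): a symbolic sketch of why the
# expected analytical gradient in these tests is (P-T)/N. Differentiating the
# per-sample logistic log loss J = (1-T)X + log(1+exp(-X)) with respect to X
# yields sigmoid(X) - T. This assumes sympy is available; it is not a dependency
# of the original tests, and _sketch_logistic_log_loss_gradient is a hypothetical
# name used only here.
# --------------------------------------------------------------------------------
def _sketch_logistic_log_loss_gradient():
    import sympy as sp

    x, t = sp.symbols('x t', real=True)
    J = (1 - t) * x + sp.log(1 + sp.exp(-x))    # per-sample logistic log loss
    dJ = sp.diff(J, x)                          # (1-t) - exp(-x)/(1+exp(-x))
    sigmoid = 1 / (1 + sp.exp(-x))
    # Expr.equals() verifies the symbolic identity dJ/dx == sigmoid(x) - t.
    assert dJ.equals(sigmoid - t)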