def test_040_objective_instantiation_to_fail():
    """
    Objective:
        Verify the layer class validates the initialization parameter constraints.
    Expected:
        Initialization detects parameter constraints not met and fails.
    """
    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        M: int = np.random.randint(1, NUM_MAX_NODES)

        # Constraint: Name is a string with length > 0.
        try:
            CrossEntropyLogLoss(
                name="",
                num_nodes=1,
                log_loss_function=sigmoid_cross_entropy_log_loss
            )
            raise RuntimeError(
                "CrossEntropyLogLoss initialization with invalid name must fail"
            )
        except AssertionError:
            pass

        # Constraint: num_nodes == 1
        try:
            CrossEntropyLogLoss(
                name="test_040_objective",
                num_nodes=0,
                log_loss_function=sigmoid_cross_entropy_log_loss
            )
            raise RuntimeError("CrossEntropyLogLoss(num_nodes<1) must fail.")
        except AssertionError:
            pass

        try:
            CrossEntropyLogLoss(
                name="test_040_objective",
                num_nodes=np.random.randint(2, NUM_MAX_NODES),
                log_loss_function=sigmoid_cross_entropy_log_loss
            )
            raise RuntimeError("CrossEntropyLogLoss(num_nodes>1) must fail.")
        except AssertionError:
            pass

        # Constraint: logging level is correct.
        try:
            CrossEntropyLogLoss(
                name="test_040_objective",
                num_nodes=M,
                log_loss_function=sigmoid_cross_entropy_log_loss,
                log_level=-1
            )
            raise RuntimeError(
                "CrossEntropyLogLoss initialization with invalid log level must fail"
            )
        except (AssertionError, KeyError):
            pass

    profiler.disable()
    profiler.print_stats(sort="cumtime")
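# The constraint checks above use the try / raise RuntimeError / except AssertionError
# pattern. For reference, a minimal sketch of the same name-constraint check written
# with pytest.raises (assuming pytest is available; not the pattern this module uses):
def demo_invalid_name_with_pytest_raises():
    import pytest
    with pytest.raises(AssertionError):
        CrossEntropyLogLoss(
            name="",    # invalid: name must be a non-empty string
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss
        )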
def output(m, d):
    return {
        "matmul": Matmul.specification(
            name="matmul",
            num_nodes=m,
            num_features=d,
            weights_initialization_scheme="he",
            weights_optimizer_specification=optimizer.SGD.specification(
                lr=0.05,
                l2=1e-3
            )
        ),
        "loss": CrossEntropyLogLoss.specification(name="loss", num_nodes=m)
    }
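# Usage sketch for output(): the returned dictionary is meant to be embedded in a
# network specification under _COMPOSITE_LAYER_SPEC, in the same shape as the one
# assembled in test() below (an assumption about how SequentialNetwork.build
# consumes the specification; for illustration only):
def demo_output_specification_usage(m: int = 1, d: int = 2):
    spec = {
        _NAME: "classifier",
        _NUM_NODES: m,
        _LOG_LEVEL: logging.ERROR,
        _COMPOSITE_LAYER_SPEC: output(m, d)
    }
    return SequentialNetwork.build(specification=spec)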
def test():
    M = 1
    D = 2
    N = 100

    X, T, V = linear_separable(d=D, n=N)
    # Retained for optional plotting of the decision boundary.
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()

    sigmoid_classifier_specification = {
        _NAME: "sigmoid_classifier",
        _NUM_NODES: M,
        _LOG_LEVEL: logging.ERROR,
        _COMPOSITE_LAYER_SPEC: {
            "matmul01": Matmul.specification(
                name="matmul",
                num_nodes=M,
                num_features=D,
                weights_initialization_scheme="he",
                weights_optimizer_specification=SGD.specification(
                    lr=TYPE_FLOAT(0.2),
                    l2=TYPE_FLOAT(1e-3)
                )
            ),
            "loss": CrossEntropyLogLoss.specification(
                name="loss",
                num_nodes=M,
                loss_function=sigmoid_cross_entropy_log_loss.__qualname__
            )
        }
    }
    logistic_classifier = SequentialNetwork.build(
        specification=sigmoid_classifier_specification,
    )

    for _ in range(50):
        logistic_classifier.train(X=X, T=T)

    prediction = logistic_classifier.predict(
        np.array([-1., -1.], dtype=TYPE_FLOAT))
    assert np.all(np.isin(prediction, [0, 1])), \
        "Prediction must be a 0/1 binary label but %s" % prediction
    print(prediction)
def train_matmul_bn_relu_classifier(
        N: int,
        D: int,
        M: int,
        X: np.ndarray,
        T: np.ndarray,
        W: np.ndarray,
        log_loss_function: Callable,
        optimizer: Optimizer,
        num_epochs: int = 100,
        test_numerical_gradient: bool = False,
        log_level: int = logging.ERROR,
        callback: Callable = None
):
    """Test case for binary classification with matmul + BN + ReLU + log loss.
    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and 2 for softmax
        X: train data
        T: labels
        W: weight
        log_loss_function: cross entropy log loss function
        optimizer: Optimizer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag to test the analytical gradient against
            the numerical one.
        log_level: logging level
        callback: callback function to invoke at each epoch end.
    """
    name = __name__
    assert isinstance(T, np.ndarray) and np.issubdtype(T.dtype, np.integer) \
        and T.ndim == 1 and T.shape[0] == N
    assert isinstance(X, np.ndarray) and X.dtype == TYPE_FLOAT \
        and X.ndim == 2 and X.shape[0] == N and X.shape[1] == D
    assert isinstance(W, np.ndarray) and W.dtype == TYPE_FLOAT \
        and W.ndim == 2 and W.shape[0] == M and W.shape[1] == D + 1
    assert num_epochs > 0 and N > 0 and D > 0
    assert (log_loss_function == softmax_cross_entropy_log_loss and M >= 2)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss: CrossEntropyLogLoss = CrossEntropyLogLoss(
        name="loss",
        num_nodes=M,
        log_loss_function=log_loss_function,
        log_level=log_level
    )

    # --------------------------------------------------------------------------------
    # Instantiate a ReLU layer
    # --------------------------------------------------------------------------------
    activation: ReLU = ReLU(name="relu", num_nodes=M, log_level=log_level)
    activation.objective = loss.function

    # --------------------------------------------------------------------------------
    # Instantiate a BatchNormalization layer
    # --------------------------------------------------------------------------------
    bn: BatchNormalization = BatchNormalization(
        name=name,
        num_nodes=M,
        log_level=logging.WARNING
    )
    bn.objective = compose(activation.function, activation.objective)

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul: Matmul = Matmul(
        name="matmul",
        num_nodes=M,
        W=W,
        optimizer=optimizer,
        log_level=log_level
    )
    matmul.objective = compose(bn.function, bn.objective)

    # --------------------------------------------------------------------------------
    # Instantiate a Standardization layer.
    # Need to apply the same mean and std to the non-training data set.
    # --------------------------------------------------------------------------------
    # norm = Standardization(
    #     name="standardization",
    #     num_nodes=M,
    #     log_level=log_level
    # )
    # X = np.copy(X)
    # X = norm.function(X)

    # Network objective function f: L=f(X)
    objective = compose(matmul.function, matmul.objective)
    prediction = compose(matmul.predict, bn.predict, activation.predict)

    num_no_progress: int = 0    # how many times the loss L has not decreased
    loss.T = T
    # pylint: disable=not-callable
    history: List[np.ndarray] = [matmul.objective(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # 1. Calculate the matmul output Y=matmul.f(X)
        # 2. Calculate the batch normalization output BN=bn.f(Y)
        # 3. Calculate the ReLU output A=activation.f(BN)
        # 4. Calculate the loss L = loss(A)
        # Test the numerical gradient dL/dX=matmul.gradient_numerical().
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        BN = bn.function(Y)
        A = activation.function(BN)
        L = loss.function(A)

        # ********************************************************************************
        # Constraint: Network objective L must match layer-by-layer output
        # ********************************************************************************
        # pylint: disable=not-callable
        assert L == objective(X) and L.shape == (), \
            "Network objective L(X) %s must match layer-by-layer output %s." \
            % (objective(X), L)

        if not (i % 10):
            print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # ********************************************************************************
        # Constraint: Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # ********************************************************************************
        if L >= history[-1] and i > 0:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s] for %s times.",
                i, L, history[-1], num_no_progress + 1
            )
            # --------------------------------------------------------------------------------
            # Reducing the learning rate can make the situation worse.
            # When the lr was reduced every time L >= history, the (L >= history)
            # events became successive and eventually exceeded 50 successive
            # non-improvements, ending in failure.
            # Keeping the learning rate makes L >= history more frequent but with
            # at most ~3 successive events, and the training still keeps progressing.
            # --------------------------------------------------------------------------------
            num_no_progress += 1
            if num_no_progress > 5:
                matmul.lr = matmul.lr * TYPE_FLOAT(0.95)

            if num_no_progress > 50:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress
                )
                break
        else:
            num_no_progress = 0

        history.append(L)

        # ================================================================================
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update Wn+1 = Wn - lr * dL/dW.
        # ================================================================================
        before = copy.deepcopy(matmul.W)
        dA = loss.gradient(TYPE_FLOAT(1))   # dL/dA
        dBN = activation.gradient(dA)       # dL/dBN
        dY = bn.gradient(dBN)               # dL/dY
        dX = matmul.gradient(dY)            # dL/dX

        # gradient descent and get the analytical gradients
        bn.update()
        dS = matmul.update()                # [dL/dW]

        # ********************************************************************************
        # Constraint. W in the matmul has been updated by the gradient descent.
        # ********************************************************************************
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradient
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)     # prepend dL/dX

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W)

    return matmul.W, objective, prediction
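# compose() above chains callables left to right: compose(f, g)(x) == g(f(x)),
# which is how matmul.objective = compose(bn.function, bn.objective) wires the
# downstream layers into one objective. A minimal sketch of such a helper (the
# repository provides its own compose; this is for illustration only):
from functools import reduce


def _compose_sketch(*functions: Callable) -> Callable:
    """Return h with h(x) == functions[-1](... functions[0](x) ...)."""
    return reduce(lambda f, g: lambda x: g(f(x)), functions)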
def test_040_objective_instance_properties():
    """
    Objective:
        Verify the layer class validates that parameters have been initialized
        before being accessed.
    Expected:
        Access to uninitialized parameters is detected and fails.
    """
    msg = "Accessing uninitialized property of the layer must fail."
    name = random_string(np.random.randint(1, 10))
    for _ in range(NUM_MAX_TEST_TIMES):
        M: int = 1
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not layer.name == name:
                raise RuntimeError("layer.name == name should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to name should be allowed as already initialized.")

        try:
            if not layer.M == M:
                raise RuntimeError("layer.M == M should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to M should be allowed as already initialized.")

        try:
            if not isinstance(layer.logger, logging.Logger):
                raise RuntimeError(
                    "isinstance(layer.logger, logging.Logger) should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to logger should be allowed as already initialized.")

        # --------------------------------------------------------------------------------
        # To fail
        # --------------------------------------------------------------------------------
        try:
            print(layer.X)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.X = int(1)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.N)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.dX)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.P)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer._Y = int(1)
            print(layer.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer._dY = int(1)
            print(layer.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.T)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.L)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.J)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.T = float(1)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.function(int(1))
            raise RuntimeError("Invoke layer.function(int(1)) must fail.")
        except AssertionError:
            pass

        try:
            layer.function(TYPE_FLOAT(1.0))
            layer.gradient(int(1))
            raise RuntimeError("Invoke layer.gradient(int(1)) must fail.")
        except AssertionError:
            pass
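# The "to fail" accesses above rely on the layer guarding uninitialized state with
# assertions. A minimal sketch of such a guarded property (an assumption about the
# layer implementation, for illustration only):
class _UninitializedGuardSketch:
    def __init__(self):
        self._X = None

    @property
    def X(self) -> np.ndarray:
        # Accessing X before it is set raises AssertionError, which is what the
        # try/except blocks above expect.
        assert self._X is not None, "X is not initialized"
        return self._X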
def disabled_test_040_objective_methods_2d_ohe(caplog):
    """
    TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(sigmoid_cross_entropy_log_loss) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_2d_ohe"
    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = 1  # node number is 1 for 0/1 binary classification.
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)      # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1)

        # log_loss function requires (X, T) in X(N, M), and T(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        # --------------------------------------------------------------------------------
        # Expected analytical gradient EG = (dL/dX) = (A-T)/N
        # --------------------------------------------------------------------------------
        A = sigmoid(X)
        EG = ((A - T).astype(TYPE_FLOAT) / TYPE_FLOAT(N))

        # --------------------------------------------------------------------------------
        # Total loss Z = np.sum(J)/N
        # Expected loss EL = sum((1-T)X + np.log(1 + np.exp(-X))) / N
        # (J, P) = sigmoid_cross_entropy_log_loss(X, T) where J:shape(N,) is the
        # loss for each input and P is the activation sigmoid(X).
        # --------------------------------------------------------------------------------
        L = layer.function(X)
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        EL = np.array(
            np.sum((1 - T) * X + logarithm(1 + np.exp(-X))) / N,
            dtype=TYPE_FLOAT
        )

        # Constraint: A == P as they are sigmoid(X)
        assert np.all(np.abs(A - P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \
            f"Need A==P==sigmoid(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n"

        # Constraint: Log loss layer output L == sum(J) from the log loss function
        Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT)
        assert np.array_equal(L, Z), \
            f"Need log loss layer output L == sum(J) but L=\n{L}\nZ=\n{Z}."

        # Constraint: L/loss is close to expected loss EL.
        assert np.all(np.abs(EL - L) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            f"Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n"

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical_jacobian(objective, X)
        # TODO: compare the diff to accommodate numerical errors.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()     # [dL/dX] from the layer

        def objective(x):
            """Function to calculate the scalar loss L for cross entropy log loss"""
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X)      # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            "GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G - EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        assert \
            np.allclose(
                GN[0], G,
                atol=GRADIENT_DIFF_ACCEPTANCE_VALUE,
                rtol=GRADIENT_DIFF_ACCEPTANCE_RATIO
            ), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRDiff is \n{G-GN[0]}.\n"

        # constraint: Gradient g of the log loss layer needs -1 < g < 1 because
        # abs(P-T) = abs(sigmoid(X)-T) cannot be > 1.
        assert np.all(np.abs(G) < 1), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}"
        assert np.all(np.abs(GN[0]) < (1 + GRADIENT_DIFF_ACCEPTANCE_RATIO)), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}"

    profiler.disable()
    profiler.print_stats(sort="cumtime")
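# Numeric sanity check of the closed form used for EL above: for p = sigmoid(x),
# -T*log(p) - (1-T)*log(1-p) == (1-T)*x + log(1 + exp(-x)). A self-contained
# sketch in float64 for clarity (the layer itself runs in TYPE_FLOAT):
def demo_sigmoid_log_loss_identity():
    x, t = np.float64(0.7), np.float64(1.0)
    p = np.float64(1.0) / (np.float64(1.0) + np.exp(-x))
    lhs = -t * np.log(p) - (1.0 - t) * np.log(1.0 - p)      # binary cross entropy
    rhs = (1.0 - t) * x + np.log(1.0 + np.exp(-x))          # closed form used for EL
    assert np.isclose(lhs, rhs)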
def disabled_test_040_objective_methods_1d_ohe():
    """
    TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is
           np.sum(cross_entropy_log_loss(sigmoid(X), T, f=logistic_log_loss)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().

    For X.ndim > 0, the layer transforms X into 2D so as to use the numpy
    tuple-like indexing P[(0,3), (2,4)]. Hence, the shapes of GN and G are 2D.
    """
    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_1d_ohe"
    N = 1

    for _ in range(NUM_MAX_TEST_TIMES):
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = TYPE_FLOAT(
            np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))
        T = TYPE_LABEL(np.random.randint(0, 2))     # OHE labels.

        # log_loss function requires (X, T) in X(N, M), and T(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T

        # Expected analytical gradient dL/dX = (P-T)/N of shape (N,M)
        A = sigmoid(X)
        EG = ((A - T) / N).reshape(1, -1).astype(TYPE_FLOAT)
        Logger.debug(
            "%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T, A, EG)

        # --------------------------------------------------------------------------------
        # constraint: L/loss == np.sum(J) / N.
        # J, P = sigmoid_cross_entropy_log_loss(X, T)
        # --------------------------------------------------------------------------------
        L = layer.function(X)   # L is shape ()
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        Z = np.array(np.sum(J), dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        assert np.array_equal(L, Z), f"LogLoss output should be {Z} but {L}."

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X)
        # Use a dummy layer for the objective function because using the "layer"
        # updates the X, Y which can interfere with the independence of the layer.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()     # [dL/dX] from the layer

        # --------------------------------------------------------------------------------
        # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L
        # because it would apply transform_X_T multiple times.
        # Internally, transform_X_T(X, T) has already transformed T into an index label
        # in 1D with length 1 via "T = T.reshape(-1)".
        # Providing X in 1D into "dummy.function(x)" re-runs "transform_X_T(X, T)".
        # With (X.ndim == T.ndim == 1) as input, T must be an OHE label and
        # T.shape == P.shape must hold for OHE labels. However, T has already been
        # converted into the index format (transform_X_T applied multiple times), and
        # (T.shape=(1,1), X.shape=(1, > 1)) violates the (X.shape == T.shape) constraint.
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name="dummy",
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # dummy.function(X)
        # --------------------------------------------------------------------------------
        def objective(x):
            """Calculate the scalar loss L for the sigmoid cross entropy log loss."""
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X).reshape(1, -1)   # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}."

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        # --------------------------------------------------------------------------------
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G - EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            "dX is \n%s\nGN is \n%s\nG-GN is \n%s\n Ratio * GN[0] is \n%s.\n" \
            % (G, GN[0], G - GN[0], GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])
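# numerical_jacobian(objective, X) above computes a finite-difference Jacobian of a
# scalar-valued objective. A minimal central-difference sketch in float64 (the
# repository provides its own numerical_jacobian; the 32-bit redesign mentioned in
# the TODOs presumably relates to the step size h vs TYPE_FLOAT precision):
def _numerical_jacobian_sketch(f: Callable, x: np.ndarray, h: float = 1e-5) -> np.ndarray:
    x = np.array(x, dtype=np.float64)       # work on a float64 copy
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        original = x[idx]
        x[idx] = original + h
        f_plus = f(x)                       # f(x + h) at this element
        x[idx] = original - h
        f_minus = f(x)                      # f(x - h) at this element
        x[idx] = original                   # restore
        grad[idx] = (f_plus - f_minus) / (2.0 * h)
        it.iternext()
    return grad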
def test_040_objective_instantiation():
    """
    Objective:
        Verify the initialized layer instance provides its properties.
    Expected:
        * name, num_nodes, M, log_level are the same as initialized.
        * X, T, dY, objective return what is set.
        * N, M properties are provided after X is set.
        * Y, P, L properties are provided after function(X).
        * gradient() returns (P-T)/N.
        * objective() of the output/last layer is an identity function.
    """
    name = "test_040_objective_instantiation"
    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = 1
        # For the sigmoid log loss layer, the number of features D in X is the
        # same as the node number M.
        D: int = M
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG
        )

        # --------------------------------------------------------------------------------
        # Properties
        # --------------------------------------------------------------------------------
        assert layer.name == name
        assert layer.num_nodes == layer.M == M

        layer._D = D
        assert layer.D == D

        X = np.random.randn(N, D).astype(TYPE_FLOAT)
        layer.X = X
        assert np.array_equal(layer.X, X)
        assert layer.N == N == X.shape[0]
        # For the sigmoid log loss layer, M == the number of features in X.
        assert layer.M == X.shape[1]

        layer._dX = X
        assert np.array_equal(layer.dX, X)

        T = np.random.randint(0, M, N).astype(TYPE_LABEL)
        layer.T = T
        assert np.array_equal(layer.T, T)

        # layer.function() gives the total loss L in shape ().
        # log_loss function requires (X, T) in X(N, M), and T(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        L = layer.function(X)
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        assert \
            L.shape == () \
            and np.allclose(L, (np.sum(J) / N).astype(TYPE_FLOAT)) \
            and L == layer.Y, \
            "After setting T, layer.function(X) generates the total loss L but %s" % L

        # layer.function(X) sets layer.P to sigmoid_cross_entropy_log_loss(X, T);
        # P is nearly equal to sigmoid(X).
        assert \
            np.array_equal(layer.P, P) and \
            np.all(np.abs(layer.P - sigmoid(X)) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            "layer.function(X) needs to set P as sigmoid_cross_entropy_log_loss(X, T) " \
            "which is close to sigmoid(X) but layer.P=\n%s\nP=\n%s\nsigmoid(X)=%s" \
            % (layer.P, P, sigmoid(X))

        # gradient of the sigmoid cross entropy log loss layer is (P-T)/N
        G = layer.gradient()
        assert \
            np.all(np.abs(G - ((P - T) / N)) < GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            "Gradient G needs (P-T)/N but G=\n%s\n(P-T)/N=\n%s\n" % (G, (P - T) / N)

        layer.logger.debug("This is a pytest")

        # pylint: disable=not-callable
        assert \
            layer.objective(np.array(1.0, dtype=TYPE_FLOAT)) \
            == np.array(1.0, dtype=TYPE_FLOAT), \
            "Objective function of the output/last layer is an identity function."
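# The (P-T)/N gradient asserted above follows from the per-sample derivative
# d/dx [(1-t)x + log(1 + exp(-x))] = (1-t) - (1 - sigmoid(x)) = sigmoid(x) - t,
# averaged over the batch. A quick float64 check of that derivative (a sketch):
def demo_gradient_is_p_minus_t():
    x, t, h = np.float64(0.3), np.float64(1.0), np.float64(1e-6)

    def loss_fn(v):
        return (1.0 - t) * v + np.log(1.0 + np.exp(-v))

    numerical = (loss_fn(x + h) - loss_fn(x - h)) / (2.0 * h)   # central difference
    p = 1.0 / (1.0 + np.exp(-x))                                # sigmoid(x)
    assert np.isclose(numerical, p - t)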
def multilayer_network_specification_bn_to_fail(D, M01, M02, M):
    sequential_layer_specification_bn_to_fail = {
        "matmul01": layer.Matmul.specification(
            name="matmul01",
            num_nodes=M01,
            num_features=D,
            weights_initialization_scheme="he",
            weights_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            )
        ),
        "bn01": layer.BatchNormalization.specification(
            name="bn01",
            num_nodes=M01,
            gamma_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            ),
            beta_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3),
            ),
            momentum=TYPE_FLOAT(0.9)
        ),
        "relu01": layer.ReLU.specification(
            name="relu01",
            num_nodes=M01,
        ),
        "matmul02": layer.Matmul.specification(
            name="matmul01",
            num_nodes=M02,
            num_features=M01,
            weights_initialization_scheme="he",
            weights_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            )
        ),
        "bn02": layer.BatchNormalization.specification(
            name="bn02",
            num_nodes=M02,
            gamma_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            ),
            beta_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3),
            ),
            momentum=TYPE_FLOAT(0.9)
        ),
        "relu02": layer.ReLU.specification(
            name="relu02",
            num_nodes=M02,
        ),
        "matmul03": layer.Matmul.specification(
            name="matmul03",
            num_nodes=M,
            num_features=M02,
            weights_initialization_scheme="he",
            weights_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            )
        ),
        "bn03": layer.BatchNormalization.specification(
            name="bn03",
            num_nodes=M,
            gamma_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3)
            ),
            beta_optimizer_specification=optimiser.SGD.specification(
                lr=TYPE_FLOAT(0.05),
                l2=TYPE_FLOAT(1e-3),
            ),
            momentum=TYPE_FLOAT(0.9)
        ),
        "loss": CrossEntropyLogLoss.specification(
            name="loss001",
            num_nodes=M
        )
    }
    return {
        _NAME: "two_layer_classifier_with_batch_normalization",
        _NUM_NODES: M,
        _LOG_LEVEL: logging.ERROR,
        _COMPOSITE_LAYER_SPEC: sequential_layer_specification_bn_to_fail
    }
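# Usage sketch: the "_to_fail" suffix indicates this specification is expected to
# be rejected when the network is built (an assumption about where the validation
# happens; the actual failing test lives elsewhere in this module):
def demo_bn_specification_to_fail(D: int = 2, M01: int = 4, M02: int = 4, M: int = 3):
    spec = multilayer_network_specification_bn_to_fail(D, M01, M02, M)
    try:
        SequentialNetwork.build(specification=spec)
        raise RuntimeError("Building the invalid specification must fail.")
    except AssertionError:
        pass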
def train_binary_classifier(
        N: int,
        D: int,
        M: int,
        X: np.ndarray,
        T: np.ndarray,
        W: np.ndarray,
        log_loss_function: Callable,
        optimizer: Optimizer,
        num_epochs: int = 100,
        test_numerical_gradient: bool = False,
        log_level: int = logging.ERROR,
        callback: Callable = None
):
    """Test case for binary classification with matmul + log loss.
    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and 2 for softmax
        X: train data
        T: labels
        W: weight
        log_loss_function: cross entropy log loss function
        optimizer: Optimizer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag to test the analytical gradient against
            the numerical one.
        log_level: logging level
        callback: callback function to invoke at each epoch end.
    """
    name = __name__
    assert isinstance(T, np.ndarray) and np.issubdtype(T.dtype, np.integer) \
        and T.ndim == 1 and T.shape[0] == N
    assert isinstance(X, np.ndarray) and X.dtype == TYPE_FLOAT \
        and X.ndim == 2 and X.shape[0] == N and X.shape[1] == D
    assert isinstance(W, np.ndarray) and W.dtype == TYPE_FLOAT \
        and W.ndim == 2 and W.shape[0] == M and W.shape[1] == D + 1
    assert num_epochs > 0 and N > 0 and D > 0
    assert (
        (log_loss_function == sigmoid_cross_entropy_log_loss and M == 1) or
        (log_loss_function == softmax_cross_entropy_log_loss and M >= 2)
    )

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(
        name="loss",
        num_nodes=M,
        log_loss_function=log_loss_function,
        log_level=log_level
    )

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul = Matmul(
        name="matmul",
        num_nodes=M,
        W=W,
        optimizer=optimizer,
        log_level=log_level
    )
    matmul.objective = loss.function

    num_no_progress: int = 0    # how many times the loss L has not decreased
    loss.T = T
    history: List[np.ndarray] = [loss.function(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # Calculate the matmul output Y=f(X), and get the loss L = objective(Y)
        # Test the numerical gradient dL/dX=matmul.gradient_numerical().
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        L = loss.function(Y)

        if not (i % 50):
            print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # --------------------------------------------------------------------------------
        # Constraint 1. Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # --------------------------------------------------------------------------------
        if L >= history[-1] and (i % 20) == 1:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s].",
                i, L, history[-1]
            )
            if (num_no_progress := num_no_progress + 1) > 20:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress
                )
                # break
        else:
            num_no_progress = 0

        history.append(L)

        # --------------------------------------------------------------------------------
        # Expected dL/dW.T = X.T @ dL/dY = X.T @ (P-T) / N, and dL/dX = dL/dY @ W
        # P = sigmoid(X) or softmax(X)
        # dL/dX = dL/dY @ W uses W BEFORE it is updated.
        # --------------------------------------------------------------------------------
        P = None
        if log_loss_function == sigmoid_cross_entropy_log_loss:
            # P = sigmoid(np.matmul(X, W.T))
            P = sigmoid(np.matmul(matmul.X, matmul.W.T))
            P = P - T.reshape(-1, 1)    # T(N,) -> T(N,1) to align with P(N,1)
            assert P.shape == (N, 1), \
                "P.shape is %s T.shape is %s" % (P.shape, T.shape)

        elif log_loss_function == softmax_cross_entropy_log_loss:
            # matmul.X.shape is (N, D+1), matmul.W.T.shape is (D+1, M)
            P = softmax(np.matmul(matmul.X, matmul.W.T))    # (N, M)
            P[np.arange(N), T] -= 1

        EDX = np.matmul(P / N, matmul.W)        # (N,M) @ (M, D+1) -> (N, D+1)
        EDX = EDX[::, 1:]                       # Drop the bias -> (N, D)
        EDW = np.matmul(matmul.X.T, P / N).T    # ((D+1,N) @ (N, M)).T -> (M, D+1)

        # --------------------------------------------------------------------------------
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update Wn+1 = Wn - lr * dL/dW.
        # --------------------------------------------------------------------------------
        before = copy.deepcopy(matmul.W)
        dY = loss.gradient(TYPE_FLOAT(1))
        dX = matmul.gradient(dY)

        # gradient descent and get the analytical gradients dS=[dL/dW]
        # dL/dX.shape = (N, D)
        # dL/dW.shape = (M, D+1)
        dS = matmul.update()
        dW = dS[0]

        # --------------------------------------------------------------------------------
        # Constraint 1. W in the matmul has been updated by the gradient descent.
        # --------------------------------------------------------------------------------
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        if not validate_against_expected_gradient(EDX, dX):
            Logger.warning("Expected dL/dX \n%s\nDiff\n%s", EDX, EDX - dX)
        if not validate_against_expected_gradient(EDW, dW):
            Logger.warning("Expected dL/dW \n%s\nDiff\n%s", EDW, EDW - dW)

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradients gn=[dL/dX, dL/dW]
            # dL/dX.shape = (N, D)
            # dL/dW.shape = (M, D+1)
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W[0])
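# Shape check for the closed-form gradients above, with the bias column folded into
# W so that X is (N, D+1) and W is (M, D+1). A self-contained float64 sketch:
def demo_expected_gradient_shapes(n: int = 4, d: int = 3, m: int = 2):
    rng = np.random.default_rng(0)
    x = rng.standard_normal((n, d + 1))     # input with the bias column
    w = rng.standard_normal((m, d + 1))     # weight
    p = rng.random((n, m))                  # stand-in for the (P - T) term
    edw = np.matmul(x.T, p / n).T           # ((D+1,N) @ (N,M)).T -> (M, D+1)
    edx = np.matmul(p / n, w)[:, 1:]        # (N, D) after dropping the bias column
    assert edw.shape == (m, d + 1) and edx.shape == (n, d)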
def validate_relu_neuron_training(
        matmul: Matmul,
        activation: ReLU,
        loss: CrossEntropyLogLoss,
        X: np.ndarray,
        T: np.ndarray,
        num_epochs: int = 100,
        test_numerical_gradient: bool = False,
        callback: Callable = None
):
    activation.objective = loss.function
    matmul.objective = compose(activation.function, loss.function)
    objective = compose(matmul.function, matmul.objective)

    num_no_progress: int = 0    # how many times the loss L has not decreased
    history: List[np.ndarray] = []

    loss.T = T
    for i in range(num_epochs):
        L = objective(X)
        N = X.shape[0]
        P = softmax(relu(np.matmul(matmul.X, matmul.W.T)))
        EDA = expected_gradient_from_log_loss(P=P, T=T, N=N)

        # ********************************************************************************
        # Constraint: Expected gradients must match actual
        # ********************************************************************************
        validate_relu_neuron_round_trip(
            matmul=matmul,
            activation=activation,
            X=X,
            dA=EDA
        )

        # --------------------------------------------------------------------------------
        # gradient descent and get the analytical dL/dX, dL/dW
        # --------------------------------------------------------------------------------
        previous_W = copy.deepcopy(matmul.W)
        matmul.update()     # dL/dX, dL/dW

        # ********************************************************************************
        # Constraint. W in the matmul has been updated by the gradient descent.
        # ********************************************************************************
        Logger.debug("W after is \n%s", matmul.W)
        if np.array_equal(previous_W, matmul.W):
            Logger.warning("W has not been updated")

        # ********************************************************************************
        # Constraint: Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # ********************************************************************************
        if i > 0 and L >= history[-1]:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s] for %s times.",
                i, L, history[-1], num_no_progress + 1
            )
            # --------------------------------------------------------------------------------
            # Reducing the learning rate can make the situation worse.
            # When the lr was reduced every time L >= history, the (L >= history)
            # events became successive and eventually exceeded 50 successive
            # non-improvements, ending in failure.
            # Keeping the learning rate makes L >= history more frequent but with
            # at most ~3 successive events, and the training still keeps progressing.
            # --------------------------------------------------------------------------------
            num_no_progress += 1
            if num_no_progress > 5:
                matmul.lr = matmul.lr * TYPE_FLOAT(0.95)

            if num_no_progress > 50:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress
                )
                break
        else:
            num_no_progress = 0

        history.append(L)

        if callback:
            callback(W=matmul.W)

    return history
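# expected_gradient_from_log_loss(P=P, T=T, N=N) above computes (P - OHE(T)) / N
# for index labels T, matching the closed form used in train_binary_classifier.
# A minimal sketch of such a helper (the repository provides its own):
def _expected_gradient_from_log_loss_sketch(P: np.ndarray, T: np.ndarray, N: int) -> np.ndarray:
    G = np.copy(P)
    G[np.arange(N), T] -= 1.0   # subtract the one-hot label in place
    return G / N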