def event_context_pairs(
        sequence: TYPE_TENSOR,
        window_size: int,
        event_size: int,
) -> TYPE_TENSOR:
    """Create (event, context) pairs from a sequence
    For a sequence (a, b, c, d, e) with window_size 3 and event_size 1:
    (event, context) = [
        event=b, context=(a, c),
        event=c, context=(b, d),
        event=d, context=(c, e)
    ]

    Args:
        sequence: 1 dimensional tensor
        window_size: size of the context including the event
        event_size: size of the event, which can be > 1

    Returns:
        (event, context) pairs of shape (N, E+C) where N=num_windows:
        [
            [b, a, c],
            [c, b, d],
            [d, c, e]
        ]
        where the first column(s) hold the event and the rest the context.
    """
    length: TYPE_INT = TYPE_INT(len(sequence))
    stride: TYPE_INT = TYPE_INT((window_size - event_size) / 2)
    assert \
        super(Function, Function).is_tensor(sequence) and \
        super(Function, Function).tensor_rank(sequence) == 1 and \
        length >= window_size > event_size > 0, \
        f"Expected a sequence of length >= {window_size} but got {sequence}"
    assert (window_size - event_size) % 2 == 0, \
        "stride must be a positive integer"

    # --------------------------------------------------------------------------------
    # The result is a matrix in which windows are stacked up.
    # The result shape is (length-window_size+1, window_size).
    # --------------------------------------------------------------------------------
    num_windows = length - window_size + 1
    context_windows = np.array([
        sequence[slice(index, index + window_size)]
        for index in range(0, num_windows)
    ], dtype=TYPE_INT)
    assert context_windows.shape == (num_windows, window_size)

    event_context_pairs = np.c_[
        context_windows[::, slice(stride, (stride + event_size))],    # Labels (event)
        context_windows[::, slice(0, stride)],                        # Left context
        context_windows[::, slice((event_size + stride), None)]       # Right context
    ]
    assert event_context_pairs.shape == (num_windows, window_size)
    return event_context_pairs
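# A minimal usage sketch of event_context_pairs, illustrative only and not part
# of the tested API. It assumes TYPE_INT maps to a NumPy integer dtype as it
# does elsewhere in this module.
def _sketch_event_context_pairs():
    seq = np.arange(5, dtype=TYPE_INT)  # (a=0, b=1, c=2, d=3, e=4)
    pairs = event_context_pairs(seq, window_size=3, event_size=1)
    # First column is the event, the remaining columns are the context.
    assert np.array_equal(pairs, np.array([[1, 0, 2], [2, 1, 3], [3, 2, 4]]))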
def test_020_event_context_instance_properties(caplog):
    """
    Objective:
        Verify the layer class validates that parameters have been
        initialized before they are accessed.
    Expected:
        Access to already-initialized properties succeeds.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_event_context_instance_properties"
    msg = "Instantiation with valid parameters must succeed."

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        stride: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        event_size: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        window_size: TYPE_INT = 2 * stride + event_size
        name = random_string(np.random.randint(1, 10))
        event_context = _must_succeed(
            name=name,
            num_nodes=TYPE_INT(1),
            window_size=window_size,
            event_size=event_size,
            msg=msg
        )

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not event_context.name == name:
                raise RuntimeError("event_context.name == name should be true")
        except AssertionError as e:
            raise RuntimeError(
                "Access to name should be allowed as already initialized."
            ) from e

        try:
            if not isinstance(event_context.logger, logging.Logger):
                raise RuntimeError(
                    "isinstance(event_context.logger, logging.Logger) should be true"
                )
        except AssertionError as e:
            raise RuntimeError(
                "Access to logger should be allowed as already initialized."
            ) from e

        assert event_context.window_size == window_size
        assert event_context.event_size == event_size

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def _instantiate(
        name: str = __name__,
        num_nodes: TYPE_INT = 1,
        target_size: TYPE_INT = 1,
        context_size: TYPE_INT = 4,
        negative_sample_size: TYPE_INT = 10,
        event_vector_size: TYPE_INT = 20,
        dictionary: EventIndexing = None,
        W: TYPE_TENSOR = None,
        log_level: int = logging.ERROR
) -> Tuple[Embedding, EventContext]:
    event_context: EventContext = _instantiate_event_context(
        name=name,
        num_nodes=TYPE_INT(1),
        window_size=(context_size + target_size),
        event_size=target_size
    )
    embedding: Embedding = Embedding(
        name=name,
        num_nodes=num_nodes,
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=log_level
    )
    return embedding, event_context
def test_020_embedding_function_multi_lines(caplog):
    """
    Objective:
        Verify the EventIndexing function can handle multi-line sentences.
    Expected:
        Embedding.function() processes the (event, context) pairs generated
        from multi-line sentences without error.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_function_multi_lines"
    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk>
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        target_size = TYPE_INT(np.random.randint(1, 3))
        context_size = TYPE_INT(2 * np.random.randint(1, 5))
        negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        embedding.function(target_context_pairs)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_010_standardize_sd_is_zero_eps():
    """
    Objective:
        Verify the standardize() function when SD is zero.
    Expected:
        standardized == (X-mean)/sqrt(eps)
    """
    name = "test_010_standardize"
    keepdims = True
    u = TYPE_FLOAT(1e-6)
    eps = TYPE_FLOAT(1e-8)

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        row = np.random.uniform(
            -MAX_ACTIVATION_VALUE, MAX_ACTIVATION_VALUE, M
        ).astype(TYPE_FLOAT)
        X = np.ones(shape=(N, M), dtype=TYPE_FLOAT) * row
        Logger.debug("%s: X \n%s\n", name, X)

        # Constraint: standardize(X) == (X - np.mean(X)) / np.std(X)
        ddof = TYPE_INT(1) if N > 1 else TYPE_INT(0)
        sd = np.std(X, axis=0, keepdims=keepdims, ddof=ddof)
        assert np.allclose(sd, TYPE_FLOAT(0), atol=u, rtol=0)

        # Expected
        mean = np.mean(X, axis=0, dtype=TYPE_FLOAT)
        E = (X - mean) / np.sqrt(eps, dtype=TYPE_FLOAT)
        # Actual
        A, __mean, __sd, _ = standardize(X, keepdims=keepdims, eps=eps)

        # Constraint: mean/sd should be the same
        assert np.allclose(mean, __mean, atol=u, rtol=TYPE_FLOAT(0))
        assert np.allclose(__sd, np.sqrt(eps), atol=u, rtol=TYPE_FLOAT(0))
        assert np.all(np.abs(E - A) < u), \
            f"X\n{X}\nstandardized\n{E}\nneeds\n{A}\n"
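# A standalone sketch of the zero-SD behavior the test above verifies. It
# assumes the standardize() denominator has the form sqrt(variance + eps),
# which reduces to sqrt(eps) when every row is identical; the actual
# implementation may differ in details.
def _sketch_standardize_zero_sd():
    eps = 1e-8
    X = np.ones((4, 3)) * np.array([1.0, 2.0, 3.0])  # identical rows: SD == 0
    mean = X.mean(axis=0)
    Z = (X - mean) / np.sqrt(X.var(axis=0) + eps)    # denominator is sqrt(eps)
    assert np.allclose(Z, 0.0)                       # numerator is also 0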
def test_020_event_context_function_event_size_2(caplog):
    """
    Objective:
        Verify the layer function with event_size = 2.
    Expected:
        function generates the expected event_context pairs.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_event_context_function_event_size_2"
    msg = "Instantiation with valid parameters must succeed."

    profiler = cProfile.Profile()
    profiler.enable()

    stride: TYPE_INT = 2
    event_size: TYPE_INT = 2
    window_size: TYPE_INT = 2 * stride + event_size
    num_nodes = TYPE_INT(1)

    event_context = _must_succeed(
        name=name,
        num_nodes=num_nodes,
        window_size=window_size,
        event_size=event_size,
        msg=msg
    )

    # Sequence: 0, 1, 2, 3, 4, 5
    # Expected event_context pairs: [[2, 3, 0, 1, 4, 5]]
    X1 = np.arange(6)
    expected1 = np.array([[2, 3, 0, 1, 4, 5]])
    Y1 = event_context.function(X1)
    assert \
        np.array_equal(Y1, expected1), \
        "Expected\n%s\nActual\n%s\n" % (expected1, Y1)

    # Sequence: 0, 1, 2, 3, 4, 5, 6
    # Expected event_context pairs: [[2, 3, 0, 1, 4, 5], [3, 4, 1, 2, 5, 6]]
    X2 = np.arange(7)
    expected2 = np.array([
        [2, 3, 0, 1, 4, 5],
        [3, 4, 1, 2, 5, 6]
    ])
    Y2 = event_context.function(X2)
    assert np.array_equal(Y2, expected2), \
        "Expected\n%s\nActual\n%s\n" % (expected2, Y2)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_std_method_function_to_succeed():
    """
    Objective:
        Verify the _layer class instance function method.
    Expected:
        Layer method calculates the expected values.
    """
    def objective(x: TYPE_TENSOR):
        """Dummy objective function"""
        return np.sum(x, dtype=TYPE_FLOAT)

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        name = random_string(np.random.randint(1, 10))
        numexpr_enabled = bool(np.random.randint(0, 2))
        numba_enabled = bool(np.random.randint(0, 2))

        # The layer computes statistics on a per-feature basis,
        # so M = 1 or N = 1 makes no sense.
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = np.random.randint(2, NUM_MAX_NODES)
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        momentum = TYPE_FLOAT(np.random.uniform(0.7, 0.99))
        eps = TYPE_FLOAT(np.random.uniform(1e-12, 1e-10)) \
            if np.random.uniform() < 0.5 else TYPE_FLOAT(0)

        _layer: Standardization = \
            _instance(name=name, num_nodes=M, momentum=momentum, eps=eps)
        _layer.objective = objective

        # ********************************************************************************
        # Constraint: total_rows_processed = times_of_invocations * N
        # ********************************************************************************
        assert _layer.total_rows_processed == 0
        ru = _layer.RU
        rsd = _layer.RSD
        _layer.function(
            X,
            numexpr_enabled=numexpr_enabled,
        )
        _validate_layer_values(_layer, X, eps=eps)
        _validate_layer_running_statistics(
            _layer=_layer, previous_ru=ru, previous_rsd=rsd, X=X, eps=eps
        )

        # ********************************************************************************
        # Constraint:
        #   _layer.N provides the latest X.shape[0].
        #   X related arrays should have their storage allocated with the X.shape:
        #   * dX
        #   * dXmd01
        #   * dXmd02
        # ********************************************************************************
        assert _layer.N == X.shape[0]
        assert \
            _layer.dX.dtype == TYPE_FLOAT and \
            _layer.dX.shape == (N, M)
        assert \
            _layer.dXmd01.dtype == TYPE_FLOAT and \
            _layer.dXmd01.shape == (N, M)
        assert \
            _layer.dXmd02.dtype == TYPE_FLOAT and \
            _layer.dXmd02.shape == (N, M)
        assert _layer.total_rows_processed == N

        # ********************************************************************************
        # Constraint: total_rows_processed = times_of_invocations * N
        # ********************************************************************************
        for i in range(np.random.randint(1, 100)):
            _layer.function(
                X,
                numexpr_enabled=numexpr_enabled,
            )
        assert _layer.total_rows_processed == TYPE_INT(N * (i + 2))

    profiler.disable()
    profiler.print_stats(sort="cumtime")
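# A sketch of the exponential running-statistics update the test above
# exercises through RU/RSD. The update form below is an assumption
# (RU <- momentum * RU + (1 - momentum) * mean(X)); the actual
# Standardization layer may differ in initialization and in how SD is tracked.
def _sketch_running_mean(momentum=0.9, batches=5, N=8, M=3):
    ru = np.zeros(M)
    for _ in range(batches):
        X = np.random.randn(N, M)
        ru = momentum * ru + (1 - momentum) * X.mean(axis=0)
    return ru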
def test_010_base_instantiation():
    """Test case for layer base class
    """
    # --------------------------------------------------------------------------------
    # name, num_nodes, log_level _init_ properties.
    # Logging debug outputs.
    # X setter/getter
    # T setter/getter
    # objective function setter/getter
    # function(x) returns x as-is (identity).
    # gradient(dL/dY) returns dL/dY as-is (identity).
    # gradient_numerical() returns 1
    # --------------------------------------------------------------------------------
    def objective(X: np.ndarray) -> Union[TYPE_FLOAT, np.ndarray]:
        """Dummy objective function"""
        return np.sum(X)

    N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
    M: int = np.random.randint(1, NUM_MAX_NODES)
    name = "test_010_base"
    _layer: Layer = Layer(name=name, num_nodes=M, log_level=logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Properties
    # --------------------------------------------------------------------------------
    assert _layer.name == name
    assert _layer.num_nodes == _layer.M == M

    _layer._D = 1
    assert _layer.D == 1

    X = np.random.randn(N, M).astype(TYPE_FLOAT)
    _layer.X = X
    assert np.array_equal(_layer.X, X), \
        "Expected:\n%s\nDiff\n%s\n" % (X, (_layer.X - X))
    assert _layer.N == N

    _layer._dX = X
    assert np.array_equal(_layer.dX, X)

    T = np.random.randint(0, M, N).astype(TYPE_LABEL)
    _layer.T = T
    assert np.array_equal(_layer.T, T)

    _layer._Y = np.dot(X, X.T)
    assert np.array_equal(_layer.Y, np.dot(X, X.T))

    _layer._dY = np.array(0.9, dtype=TYPE_FLOAT)
    assert _layer._dY == np.array(0.9, dtype=TYPE_FLOAT)

    _layer.logger.debug("This is a pytest")

    # --------------------------------------------------------------------------------
    # Methods
    # --------------------------------------------------------------------------------
    try:
        # pylint: disable=not-callable
        _layer.function(TYPE_INT(1))
        raise RuntimeError("Invoke layer.function(int(1)) must fail.")
    except AssertionError:
        pass

    x = np.array(1.0, dtype=TYPE_FLOAT)
    assert np.array_equal(_layer.function(x), x)

    try:
        Y = _layer.function(X)
        assert Y.ndim > 0
        _layer.gradient(int(1))
        raise RuntimeError("Invoke layer.gradient(int(1)) must fail.")
    except AssertionError:
        pass

    Y = _layer.function(X)
    assert np.array_equal(_layer.gradient(Y), Y)

    _layer.objective = objective
    # pylint: disable=not-callable
    assert np.array_equal(_layer.objective(X), objective(X))
def stride(self) -> TYPE_INT:
    """Length of the preceding and succeeding context"""
    return TYPE_INT((self.window_size - self.event_size) / 2)
def vocabulary_size(self) -> TYPE_INT:
    """Number of unique events in the vocabulary"""
    return TYPE_INT(len(self.vocabulary))
def test_word2vec():
    USE_TEXT8 = False
    USE_PTB = not USE_TEXT8

    # --------------------------------------------------------------------------------
    # text8 is one gigantic line of millions of words, which cannot fit in a vector.
    # Split it into N words per line:
    #   cat text8 | xargs -n $N > text8_$N
    # --------------------------------------------------------------------------------
    CORPUS_FILE = "text8_512" if USE_TEXT8 else "ptb"
    CORPUS_URL = "https://data.deepai.org/text8.zip" \
        if USE_TEXT8 else 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.test.txt'

    TARGET_SIZE = TYPE_INT(1)       # Size of the target event (word)
    CONTEXT_SIZE = TYPE_INT(6)      # Size of the context in which the target event occurs.
    WINDOW_SIZE = TARGET_SIZE + CONTEXT_SIZE
    SAMPLE_SIZE = TYPE_INT(6)       # Size of the negative samples
    VECTOR_SIZE = TYPE_INT(100)     # Number of features in the event vector.

    WEIGHT_SCHEME = "normal"
    WEIGHT_PARAMS = {"std": 0.01}
    LR = TYPE_FLOAT(20.0)

    # For text8_512 where one line has 512 words, the number of sentences to take in is small.
    # For ptb_train where one line may have only a few words, take in more lines to avoid an
    # (event, context) pair where the event is an invalid padding word.
    # Because of padding, when only a few sentences are provided,
    # the 'event' in (event, context) can result in 0, e.g.:
    # [[562  58 117   0   0   0   0   0   0   0   0]]
    NUM_SENTENCES = 1 if USE_TEXT8 else 10

    STATE_FILE = \
        "/home/oonisim/home/repository/git/oonisim/python_programs/nlp/models/" \
        "word2vec_sgram_%s_E%s_C%s_S%s_W%s_%s_%s_V%s_LR%s_N%s.pkl" % (
            CORPUS_FILE,
            TARGET_SIZE,
            CONTEXT_SIZE,
            SAMPLE_SIZE,
            WEIGHT_SCHEME,
            "std",
            WEIGHT_PARAMS["std"],
            VECTOR_SIZE,
            LR,
            NUM_SENTENCES,
        )
    MAX_ITERATIONS = 30

    # --------------------------------------------------------------------------------
    # Corpus text
    # --------------------------------------------------------------------------------
    path_to_corpus = f"{str(Path.home())}/.keras/datasets/{CORPUS_FILE}"
    if fileio.Function.is_file(path_to_corpus):
        pass
    else:
        path_to_corpus = tf.keras.utils.get_file(
            fname=CORPUS_FILE,
            origin=CORPUS_URL,
            extract=True
        )
    corpus = fileio.Function.read_file(path_to_corpus)

    # --------------------------------------------------------------------------------
    # Logistic Log Loss
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(
        name="loss",
        num_nodes=1,    # Logistic log loss
        log_loss_function=sigmoid_cross_entropy_log_loss
    )

    # --------------------------------------------------------------------------------
    # Event indexing
    # --------------------------------------------------------------------------------
    word_indexing = EventIndexing(
        name="word_indexing",
        corpus=corpus,
        min_sequence_length=WINDOW_SIZE
    )
    del corpus

    # --------------------------------------------------------------------------------
    # Event Context
    # --------------------------------------------------------------------------------
    event_context = EventContext(
        name="ev",
        window_size=WINDOW_SIZE,
        event_size=TARGET_SIZE
    )

    # --------------------------------------------------------------------------------
    # Event Embedding
    # --------------------------------------------------------------------------------
    embedding: Embedding = Embedding(
        name="embedding",
        num_nodes=WINDOW_SIZE,
        target_size=TARGET_SIZE,
        context_size=CONTEXT_SIZE,
        negative_sample_size=SAMPLE_SIZE,
        event_vector_size=VECTOR_SIZE,
        optimizer=SGD(lr=LR),
        dictionary=word_indexing,
        weight_initialization_scheme=WEIGHT_SCHEME,
        weight_initialization_parameters=WEIGHT_PARAMS
    )

    # --------------------------------------------------------------------------------
    # Adapter between Embedding and Log Loss
    # --------------------------------------------------------------------------------
    adapter_function = embedding.adapt_function_to_logistic_log_loss(loss=loss)
    adapter_gradient = embedding.adapt_gradient_to_logistic_log_loss()
    adapter: Adapter = Adapter(
        name="adapter",
        num_nodes=TYPE_INT(1),  # Number of output M=1
        function=adapter_function,
        gradient=adapter_gradient
    )

    # --------------------------------------------------------------------------------
    # Network
    # --------------------------------------------------------------------------------
    network = SequentialNetwork(
        name="word2vec",
        num_nodes=1,
        inference_layers=[word_indexing, event_context, embedding, adapter],
        objective_layers=[loss]
    )

    def sentences_generator(path_to_file, num_sentences):
        stream = fileio.Function.file_line_stream(path_to_file)
        try:
            while True:
                yield np.array(fileio.Function.take(num_sentences, stream))
        finally:
            stream.close()

    # Restore the state if it exists.
    if fileio.Function.is_file(STATE_FILE):
        state = embedding.load(STATE_FILE)
        fmt = """Model loaded.
event_size %s
context_size: %s
event_vector_size: %s
"""
        print(fmt % (
            state["target_size"],
            state["context_size"],
            state["event_vector_size"]
        ))
    else:
        print("State file does not exist. Saving the initial model %s." % STATE_FILE)
        embedding.save(STATE_FILE)

    # Continue training
    profiler = cProfile.Profile()
    profiler.enable()

    total_sentences = 0
    epochs = 0
    source = sentences_generator(path_to_file=path_to_corpus, num_sentences=NUM_SENTENCES)
    for i in range(MAX_ITERATIONS):
        try:
            sentences = next(source)
            total_sentences += len(sentences)
            start = time.time()
            network.train(X=sentences, T=np.array([0]))
            if i % 100 == 0:
                print(f"Batch {i:05d} of {NUM_SENTENCES} sentences: "
                      f"Average Loss: {np.mean(network.history):10f} "
                      f"Duration {time.time() - start:3f}")
            if i % 10000 == 0:
                embedding.save(STATE_FILE)
        except fileio.Function.GenearatorHasNoMore as e:
            source.close()
            embedding.save(STATE_FILE)

            # Next epoch
            print(f"epoch {epochs} batches {i:05d} done")
            epochs += 1
            source = sentences_generator(
                path_to_file=path_to_corpus, num_sentences=NUM_SENTENCES
            )
        except Exception as e:
            print("Unexpected error:", sys.exc_info()[0])
            source.close()
            raise e

    embedding.save(STATE_FILE)
    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_save_load(caplog):
    """
    Objective:
        Verify the save/load function of EventIndexing.
    Expected:
        The layer state can be saved, invalidated, then restored, after which
        the layer functions as before.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_save_load"
    sentences = """
    Verify the gradient descent, especially np.ufunc.at, is working as expected
    """
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(10):
        # First validate the correct configuration, then change parameters one by one.
        target_size = TYPE_INT(np.random.randint(1, 2))
        context_size = TYPE_INT(2 * np.random.randint(1, 2))
        negative_sample_size = TYPE_INT(np.random.randint(1, 2))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(2, 3))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # Run methods and save the results to compare later
        # --------------------------------------------------------------------------------
        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y1 = embedding.function(target_context_pairs)

        # --------------------------------------------------------------------------------
        # Save the layer state and invalidate the state variables
        # --------------------------------------------------------------------------------
        tester = testing.layer.Function(instance=embedding)
        tester.test_save()
        backup_W = copy.deepcopy(embedding.W)
        embedding._W = np.empty(0)

        # --------------------------------------------------------------------------------
        # Confirm the layer does not function anymore
        # --------------------------------------------------------------------------------
        try:
            embedding.function(target_context_pairs)
            raise RuntimeError("Must fail with state deleted")
        except Exception as e:
            pass

        # --------------------------------------------------------------------------------
        # Restore the state and confirm the layer functions as expected.
        # Because of random negative sampling, the result of the context part differs
        # every time. Hence only the true label (target) part can be the same.
        # --------------------------------------------------------------------------------
        try:
            # Constraint:
            #   Layer works after state reloaded
            tester.test_load()
            assert np.allclose(backup_W, embedding.W)

            Y2 = embedding.function(target_context_pairs)
            assert \
                np.array_equal(Y1[::, 0], Y2[::, 0]), \
                "Expected Y\n%s\nActual\n%s\ndiff\n%s\n" \
                % (Y1[::, 0], Y2[::, 0], (Y1[::, 0] - Y2[::, 0]))
        except Exception as e:
            raise e
        finally:
            tester.clean()

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_loss_adapter_function_to_fail(caplog):
    """
    Objective:
        Verify the Adapter function invalidates shapes other than:
        - Y:(N, 1+SL)
        - ys:(N, SL)
        - ye:(N, 1)
    Expected:
        Adapter.function(Y) fails when the Y shape is not (N, 1+SL).
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"
    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk>
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)

        # ********************************************************************************
        # Constraint: Y in shape:(N,M) where M > SL+1 must fail
        # ********************************************************************************
        shape = (N, 1 + SL + np.random.randint(1, 100))
        msg = "Y shape %s which is not the expected shape %s must fail" % \
            (shape, (N, SL + 1))
        dummy_Y = np.random.uniform(size=shape).astype(TYPE_FLOAT)
        _function_must_fail(adapter=adapter, Y=dummy_Y, msg=msg)

        # ********************************************************************************
        # Constraint: Y in shape (N+,) must fail.
        # The Adapter function can accept (N,) but not (N+,).
        # ********************************************************************************
        shape = (N + np.random.randint(1, 100),)
        msg = "Y shape %s which is not the expected shape %s must fail" % \
            (shape, (N,))
        dummy_Y = np.random.uniform(size=shape).astype(TYPE_FLOAT)
        _function_must_fail(adapter=adapter, Y=dummy_Y, msg=msg)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_gradient_vs_autodiff(caplog):
    """
    Objective:
        Verify the Embedding analytical gradient against the TF autodiff
        implemented in the gradient_numerical() method of the layer.
    Expected:
        Gradients [dWe, dWs, dWc] calculated in the gradient() method match
        those calculated in gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    from function import text
    from . test_020_embedding_sample_sentences import (
        bbc_world_us_canada_56988381 as sentences
    )
    max_sentence_length = TYPE_INT(text.Function.max_sentence_length(sentences))
    assert max_sentence_length >= 3

    profiler = cProfile.Profile()
    profiler.enable()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    # --------------------------------------------------------------------------------
    # Ye = einsum("nd,ned->n", Bc:(N,D), We:(N,E,D))
    # dL/dWe:(N,E,D) = dL/dYe * dYe/dWe = dL/dYe * Bc
    #
    # Ys = einsum("nd,nsd->ns", Bc:(N,D), Ws:(N,SL,D))
    # dL/dWs:(N,SL,D) = dL/dYs * dYs/dWs = dL/dYs * Bc
    #
    # By setting
    # 1. dL/dY = np.c_[dL/dYe, dL/dYs] = I and
    # 2. context_size C == negative_sample_size SL,
    # the constraint is E * dL/dWe == dL/dWs == Bc because dL/dYe, dL/dYs are I.
    # dL/dWe is normalized with E to be independent from the event (target) size.
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        # C must be an even number
        C = TYPE_INT(np.random.randint(1, max_sentence_length / 2) * 2)
        assert C < max_sentence_length

        # E=SL so that (N,E,D) and (N,SL,D) have the same shape
        E = SL = TYPE_INT(
            np.random.randint(
                1,
                min(
                    Embedding.MAX_TARGET_SIZE,
                    Embedding.MAX_NEGATIVE_SAMPLE_SIZE,
                    (max_sentence_length - C)
                ) + 1
            )
        )
        target_size = negative_sample_size = TYPE_INT(E)
        context_size = TYPE_INT(C)
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )
        embedding.objective = L

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)

        # --------------------------------------------------------------------------------
        # Forward path
        # --------------------------------------------------------------------------------
        Y = embedding.function(target_context_pairs)
        EDWe, EDWs, EDWc = embedding.gradient_numerical()

        # --------------------------------------------------------------------------------
        # Backward path
        # --------------------------------------------------------------------------------
        dY = Function.ones(shape=Function.tensor_shape(Y))
        embedding.gradient(dY)

        # --------------------------------------------------------------------------------
        # Gradient descent
        # --------------------------------------------------------------------------------
        dWe, dWs, dWc = embedding.update()

        # ********************************************************************************
        # Constraint:
        # - dW is close to EDW
        # - dL/dWe = dL/dWs = Bc when dL/dY = I
        # ********************************************************************************
        assert Function.all_close(
            EDWe, dWe
        ), "Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe - dWe)
        assert Function.all_close(
            EDWs, dWs
        ), "Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs - dWs)
        assert Function.all_close(
            EDWc, dWc
        ), "Expected (EDWc==dWc)\n%s\ndifference\n%s\n" % (EDWc, EDWc - dWc)
        assert Function.all_close(
            dWe * E, dWs
        ), "Expected (dWe*E==dWs) but dWe:\n%s\ndifference\n%s\n" % (dWe, dWe * E - dWs)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
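# A small numeric check of the einsum gradient identity stated in the comment
# block of the test above: with L = sum(Ye) so that dL/dYe = 1, the gradient
# dL/dWe[n,e,d] equals Bc[n,d]. This is a standalone sketch with assumed toy
# shapes, not the Embedding layer's actual implementation.
def _sketch_einsum_grad(N=2, E=3, D=4, h=1e-6):
    rng = np.random.default_rng(0)
    Bc = rng.standard_normal((N, D))
    We = rng.standard_normal((N, E, D))
    L = lambda w: np.einsum("nd,ned->n", Bc, w).sum()   # dL/dYe = 1 for every n
    # Central-difference check on one element: numerical dL/dWe[0,1,2] ~ Bc[0,2]
    Wp = We.copy(); Wp[0, 1, 2] += h
    Wm = We.copy(); Wm[0, 1, 2] -= h
    assert abs((L(Wp) - L(Wm)) / (2 * h) - Bc[0, 2]) < 1e-6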
def test_020_embedding_instance_properties_access_to_succeed(caplog):
    """
    Objective:
        Verify the layer class validates that parameters have been
        initialized before they are accessed.
    Expected:
        Access to already-initialized parameters succeeds.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_instance_properties_access_to_succeed"
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
        W: TYPE_TENSOR = np.random.rand(
            dictionary.vocabulary_size, event_vector_size
        ).astype(TYPE_FLOAT)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not embedding.name == name:
                raise RuntimeError("embedding.name == name should be true")
        except AssertionError:
            raise RuntimeError("Access to name should be allowed as already initialized.")

        try:
            if not embedding.V == dictionary.vocabulary_size:
                raise RuntimeError("embedding.V == vocabulary_size should be true")
        except AssertionError:
            raise RuntimeError("Access to V should be allowed as already initialized.")

        try:
            if not embedding.E == target_size:
                raise RuntimeError("embedding.E == target_size should be true")
        except AssertionError:
            raise RuntimeError("Access to E should be allowed as already initialized.")

        try:
            if not embedding.C == context_size:
                raise RuntimeError("embedding.C == context_size should be true")
        except AssertionError:
            raise RuntimeError("Access to C should be allowed as already initialized.")

        try:
            if not embedding.window_size == target_size + context_size:
                raise RuntimeError("embedding.window_size == target_size+context_size should be true")
        except AssertionError:
            raise RuntimeError("Access to window_size should be allowed as already initialized.")

        try:
            if not embedding.SL == negative_sample_size:
                raise RuntimeError("embedding.SL == negative_sample_size should be true")
        except AssertionError:
            raise RuntimeError("Access to negative_sample_size should be allowed as already initialized.")

        try:
            if embedding.dictionary is not dictionary:
                raise RuntimeError("embedding.dictionary is dictionary should be true")
        except AssertionError:
            raise RuntimeError("Access to dictionary should be allowed as already initialized.")

        try:
            # Embedding internally deepcopies W to avoid unexpected changes, and the
            # event vectors for UNK and NIL are zero-cleared.
            if not np.array_equal(
                    embedding.W[len(EVENT_META_ENTITIES):],
                    W[len(EVENT_META_ENTITIES):]
            ):
                raise RuntimeError("np.array_equal(embedding.W, W) should be true")
        except AssertionError:
            raise RuntimeError("Access to W should be allowed as already initialized.")

        try:
            if not isinstance(embedding.optimizer, SGD):
                raise RuntimeError("isinstance(embedding.optimizer, SGD) should be true")
        except AssertionError:
            raise RuntimeError("Access to optimizer should be allowed as already initialized.")

        try:
            opt = SGD()
            if not embedding.lr == opt.lr:
                raise RuntimeError("embedding.lr == opt.lr should be true")
        except AssertionError:
            raise RuntimeError("Access to lr should be allowed as already initialized.")

        try:
            opt = SGD()
            if not embedding.l2 == opt.l2:
                raise RuntimeError("embedding.l2 == opt.l2 should be true")
        except AssertionError:
            raise RuntimeError("Access to l2 should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["target_size"], embedding.E):
                raise RuntimeError("embedding.S['target_size'] == embedding.E should be true")
        except AssertionError:
            raise RuntimeError("Access to S['target_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["context_size"], embedding.C):
                raise RuntimeError("embedding.S['context_size'] == embedding.C should be true")
        except AssertionError:
            raise RuntimeError("Access to S['context_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["negative_sample_size"], embedding.SL):
                raise RuntimeError("embedding.S['negative_sample_size'] == embedding.SL should be true")
        except AssertionError:
            raise RuntimeError("Access to S['negative_sample_size'] should be allowed as already initialized.")

        try:
            if not np.array_equal(embedding.S["W"], embedding.W):
                raise RuntimeError("embedding.S['W'] == embedding.W should be true")
        except AssertionError:
            raise RuntimeError("Access to S['W'] should be allowed as already initialized.")

        try:
            if not isinstance(embedding.logger, logging.Logger):
                raise RuntimeError("isinstance(embedding.logger, logging.Logger) should be true")
        except AssertionError:
            raise RuntimeError("Access to logger should be allowed as already initialized.")

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_logistic_loss_instantiation_to_fail():
    """
    Objective:
        Verify the layer class validates the initialization parameter constraints.
    Expected:
        Initialization detects unmet parameter constraints and fails.
    """
    name = "test_020_adapt_embedding_logistic_loss_instantiation_to_fail"

    # First validate the correct configuration, then change parameters one by one.
    dictionary: EventIndexing = _instantiate_event_indexing()
    target_size = TYPE_INT(np.random.randint(1, 10))
    context_size = TYPE_INT(2 * np.random.randint(1, 10))
    negative_sample_size = TYPE_INT(np.random.randint(5, 20))
    event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
    W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)
    _instantiate(
        name=name,
        num_nodes=TYPE_INT(1),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
    )

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))

        msg = "Name must be a string with length > 0."
        _instantiate_must_fail(
            name="",
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "num_nodes must be > 0."
        _instantiate_must_fail(
            name=name,
            num_nodes=TYPE_INT(0),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_instance_properties_access_to_fail(caplog):
    """
    Objective:
        Verify the layer class validates that parameters have been
        initialized before they are accessed.
    Expected:
        Access to uninitialized parameters is detected and fails.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_instance_properties_access_to_fail"
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )

        # --------------------------------------------------------------------------------
        # To fail
        # --------------------------------------------------------------------------------
        msg = "Accessing uninitialized property of the layer must fail."
        try:
            print(embedding.X)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.N)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dX)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dW)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(embedding.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_embedding_instantiation_to_fail():
    """
    Objective:
        Verify the layer class validates the initialization parameter constraints.
    Expected:
        Initialization detects unmet parameter constraints and fails.
    """
    name = "test_020_embedding_instantiation_to_fail"

    # First validate the correct configuration, then change parameters one by one.
    dictionary: EventIndexing = _instantiate_event_indexing()
    target_size = TYPE_INT(np.random.randint(1, 10))
    context_size = TYPE_INT(2 * np.random.randint(1, 10))
    negative_sample_size = TYPE_INT(np.random.randint(5, 20))
    event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))
    W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)
    _must_succeed(
        name=name,
        num_nodes=TYPE_INT(1),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
        msg="must succeed"
    )
    _must_succeed(
        name=name,
        num_nodes=(1 + negative_sample_size),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        log_level=logging.DEBUG,
        msg="must succeed without W"
    )

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        target_size = TYPE_INT(np.random.randint(1, 10))
        context_size = TYPE_INT(2 * np.random.randint(1, 10))
        negative_sample_size = TYPE_INT(np.random.randint(5, 20))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 100))

        msg = "Name must be a string with length > 0."
        _must_fail(
            name="",
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "num_nodes must be > 0."
        _must_fail(
            name=name,
            num_nodes=TYPE_INT(0),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        # msg = "num_nodes is 1+negative_sample_size but does not have to be enforced"
        # _must_fail(
        #     name=name,
        #     num_nodes=TYPE_INT(1+negative_sample_size),
        #     target_size=target_size,
        #     context_size=context_size,
        #     negative_sample_size=negative_sample_size,
        #     event_vector_size=event_vector_size,
        #     dictionary=dictionary,
        #     log_level=logging.DEBUG,
        #     msg=msg
        # )

        msg = "target_size must be > 0."
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(0),
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "context_size must be > 0."
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=TYPE_INT(0),
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "negative_sample_size must be > 0."
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=TYPE_INT(0),
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "event_vector_size must be > 0."
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=TYPE_INT(0),
            dictionary=dictionary,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "dictionary must be of type EventIndexing"
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=["hoge", None][np.random.randint(0, 2)],
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "vocabulary_size must be >= " \
              "(context_size + target_size) + " \
              "negative_sample_size + " \
              "len(EVENT_META_ENTITIES)"
        length = (context_size + target_size) + negative_sample_size
        corpus = " ".join(str(i) for i in range(length))
        _indexing_dummy = EventIndexing(
            name=__name__,
            num_nodes=1,
            corpus=corpus
        )
        assert _indexing_dummy.vocabulary_size == length + len(EVENT_META_ENTITIES)
        _must_succeed(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=_indexing_dummy,
            log_level=logging.DEBUG,
            msg=msg
        )
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(target_size) + 1,
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=_indexing_dummy,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "W.shape[0] must be >= vocabulary_size"
        w = np.random.randn(
            dictionary.vocabulary_size - np.random.randint(1, dictionary.vocabulary_size),
            event_vector_size
        )
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=dictionary,
            W=w,
            log_level=logging.DEBUG,
            msg=msg
        )

        msg = "event_vector_size must equal W.shape[1]"
        offset = np.random.randint(1, event_vector_size) * (1 if random.random() < 0.5 else -1)
        assert (event_vector_size + offset) > 0, \
            "%s %s %s" % (event_vector_size, offset, (event_vector_size + offset))
        w = np.random.randn(
            dictionary.vocabulary_size,
            (event_vector_size + offset)
        )
        _must_fail(
            name=name,
            num_nodes=(1 + negative_sample_size),
            target_size=TYPE_INT(target_size),
            context_size=TYPE_INT(context_size),
            negative_sample_size=TYPE_INT(negative_sample_size),
            event_vector_size=TYPE_INT(event_vector_size),
            dictionary=dictionary,
            W=w,
            log_level=logging.DEBUG,
            msg=msg
        )

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_adapt_embedding_loss_adapter_function_ye_to_succeed(caplog):
    """
    Objective:
        Verify the Adapter function handles Y in the shapes:
        - Y:(N, 1+SL)
        - ys:(N, SL)
        - ye:(N, 1)
    Expected:
        Adapter.function(Y) returns:
        - For Y:(N, 1+SL), the return is in shape (N*(1+SL), 1). Log loss T is set to the same shape.
        - For Y:(N, SL), the return is in shape (N*SL, 1). Log loss T is set to the same shape.
        - For Y:(N,), the return is in shape (N, 1). Log loss T is set to the same shape.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"
    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk>
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)

        # ********************************************************************************
        # Constraint:
        # - Adapter function returns (N,1) with the same values as ye.
        # - Adapter function has set T:(N,1) with value 1 in the loss layer.
        # ********************************************************************************
        msg = "ye must succeed"
        ye = Y[::, 0]
        EZ = expected_Z = embedding.reshape(ye, shape=(N, 1))
        Z = _function_must_succeed(adapter=adapter, Y=ye, msg=msg)
        assert embedding.all_close(
            Z, EZ,
            "Z must be close to EZ. Z:\n%s\nEZ\n%s\nDiff\n%s\n" % (Z, EZ, (EZ - Z))
        )

        T = np.ones(shape=(N, 1), dtype=TYPE_LABEL)
        assert embedding.all_equal(T, loss.T), \
            "Expected T must equal loss.T. Expected\n%s\nLoss.T\n%s\n" % (T, loss.T)

    profiler.disable()
    profiler.print_stats(sort="cumtime")
def test_020_cross_entropy_log_loss_1d(caplog):
    """
    Objective:
        Test the categorical log loss values for P in 1 dimension.
    Constraints:
        1. The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h.
        2. The numerical gradient gn is within +/- u of the analytical g = -T/P.

    P: Probabilities from softmax of shape (M,)
    M: Number of nodes in the cross_entropy_log_loss layer.
    T: Labels

    Note:
        log(P=1) -> 0
        dlog(x)/dx = 1/x
    """
    def f(P: np.ndarray, T: np.ndarray):
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = True/1, OHE label T[index] = 1 where
    # P=[0,0,0,...,1,...0], T = [0,0,0,...1,...0]. T[i] == 1
    #
    # Do not forget the Jacobian shape is (N,); calculate each element.
    # 1. For T=1, loss L = -log(Pi) = 0 and dL/dP = -(1/Pi) = -1 is expected.
    # 2. For T=0, loss L = (-log(0+offset+h) - log(0+offset-h)) / 2h = 0 is expected.
    # --------------------------------------------------------------------------------
    M: TYPE_INT = np.random.randint(2, NUM_MAX_NODES)
    index: TYPE_INT = TYPE_INT(np.random.randint(0, M))     # Position of the true label in P
    P1 = np.zeros(M, dtype=TYPE_FLOAT)
    P1[index] = TYPE_FLOAT(1.0)
    T1 = np.zeros(M, dtype=TYPE_LABEL)
    T1[index] = TYPE_LABEL(1)

    # Analytically correct gradient for P=1, T=1
    AG = np.zeros_like(P1, dtype=TYPE_FLOAT)
    AG[index] = TYPE_FLOAT(-1)  # dL/dP = -1

    EGN1 = np.zeros_like(P1, dtype=TYPE_FLOAT)  # Expected numerical gradient
    EGN1[index] = (-1 * logarithm(TYPE_FLOAT(1.0 + h)) +
                   TYPE_FLOAT(1) * logarithm(TYPE_FLOAT(1.0 - h))) / TYPE_FLOAT(2 * h)
    assert np.all(np.abs(EGN1 - AG) < u), \
        "Expected EGN-1<%s but %s\nEGN=\n%s" % (u, (EGN1 - AG), EGN1)

    GN1 = numerical_jacobian(partial(f, T=T1), P1)
    assert np.all(np.abs(GN1 - AG) < u), \
        "Expected GN-1<%s but %s\nGN=\n%s" % (u, (GN1 - AG), GN1)

    # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
    assert GN1.shape == EGN1.shape
    assert np.all(np.abs(EGN1 - GN1) < u), \
        "Expected GN1==EGN1 but GN1-EGN1=\n%sP=\n%s\nT=%s\nEGN=\n%s\nGN=\n%s\n" \
        % (np.abs(GN1 - EGN1), P1, T1, EGN1, GN1)

    # The numerical gradient gn is within +/- u of the analytical g = -T/P
    G1 = np.zeros_like(P1, dtype=TYPE_FLOAT)
    G1[T1 == 1] = -1 * (T1[index] / P1[index])
    # G1[T1 != 0] = 0
    check.equal(np.all(np.abs(G1 - GN1) < u), True, "G1-GN1 %s\n" % np.abs(G1 - GN1))

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = np uniform(), index label T=index
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        M = np.random.randint(2, NUM_MAX_NODES)     # M > 1
        T2 = TYPE_LABEL(np.random.randint(0, M))    # location of the truth
        P2 = np.zeros(M, dtype=TYPE_FLOAT)
        while not (x := TYPE_FLOAT(
                np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P2[T2] = p

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # G:[0, 0, ...,g, 0, ...] where g is the numerical gradient close to -1/p.
        # --------------------------------------------------------------------------------
        EGN2 = np.zeros_like(P2, dtype=TYPE_FLOAT)  # Expected numerical gradient
        EGN2[T2] = TYPE_FLOAT(-1) * (logarithm(p + h) - logarithm(p - h)) / TYPE_FLOAT(2 * h)
        GN2 = numerical_jacobian(partial(f, T=T2), P2)

        # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
        assert GN2.shape == EGN2.shape
        assert np.all(np.abs(GN2 - EGN2) < u), \
            f"Delta expected to be < {u} but \n{np.abs(GN2 - EGN2)}"

        G2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        G2[T2] = -1 / p

        # The numerical gradient gn is within +/- u of the analytical g = -T/P
        check.equal(np.all(np.abs(G2 - GN2) < u), True, "G2-GN2 %s\n" % np.abs(G2 - GN2))
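# A standalone sketch of the central-difference formula that the test above
# relies on: gn = (f(p+h) - f(p-h)) / 2h. The helper name is hypothetical and
# is not the numerical_jacobian used by the test suite.
def _sketch_central_difference():
    def central_diff(f, p, h=1e-5):
        return (f(p + h) - f(p - h)) / (2 * h)
    p = 0.7
    g = central_diff(lambda q: -np.log(q), p)   # numerical dL/dp for L = -log(p)
    assert abs(g - (-1.0 / p)) < 1e-6           # matches the analytical -1/p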
def test_020_embedding_gradient_descent(caplog):
    """
    Objective:
        Verify the gradient descent, especially np.ufunc.at, is working as expected.
        W:(V, D=3) where all elements are initialized to 1.0.
        dL/dY:(1,E+SL) = I with E=1, SL=1.
        X=(target,context)=[3,4,5,6,5] where target_index=3.
    Expected:
        The context index 5 occurs twice, so W[5] should be updated twice by the
        gradient descent as W[5] = W[5] - [lr * (1 + l2) * dWc] * 2.
        For the target index 3 and the other context indices 4, 6:
        W[3] = W[3] - [lr * (1 + l2) * dWe].
        W[4] = W[4] - [lr * (1 + l2) * dWc].
        W[6] = W[6] - [lr * (1 + l2) * dWc].
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    target_size = negative_sample_size = TYPE_INT(1)
    context_size = TYPE_INT(4)
    event_vector_size = TYPE_INT(3)
    W: TYPE_TENSOR = np.ones(
        shape=(dictionary.vocabulary_size, event_vector_size),
        dtype=TYPE_FLOAT
    )

    embedding, event_context = _must_succeed(
        name=name,
        num_nodes=(1 + negative_sample_size),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
        msg="must succeed"
    )
    del W   # embedding deepcopies W to avoid unexpected changes
    embedding.objective = L

    target_context_pairs = np.array([[3, 4, 5, 6, 5]], dtype=TYPE_INT)

    # --------------------------------------------------------------------------------
    # Forward path
    # --------------------------------------------------------------------------------
    Y = embedding.function(target_context_pairs)
    EDWe, EDWs, EDWc = embedding.gradient_numerical()
    print(f"Loss {L(Y)}\n")

    # --------------------------------------------------------------------------------
    # Backward path
    # --------------------------------------------------------------------------------
    dY = Function.ones(shape=Function.tensor_shape(Y))
    embedding.gradient(dY)

    # --------------------------------------------------------------------------------
    # Expected We, Wc (we do not know Ws as negative sampling is stochastic).
    # This is for SGD as the optimizer.
    # --------------------------------------------------------------------------------
    lr = embedding.lr
    l2 = embedding.l2

    expected_dWe = lr * (1 + l2) * embedding.dWe
    diff_We = embedding.optimizer.differential(dW=embedding.dWe)
    msg_We = "dWe: expected\n%s\n but actual diff=:\n%s\n" % \
        (expected_dWe, (expected_dWe - diff_We))
    assert embedding.all_close(
        expected_dWe, diff_We, msg=msg_We
    )
    EWe = embedding.W[3] - expected_dWe

    expected_dWc = lr * (1 + l2) * embedding.dWc
    diff_Wc = embedding.optimizer.differential(dW=embedding.dWc)
    msg_Wc = "dWc: expected\n%s\n but actual diff=:\n%s\n" % \
        (expected_dWc, (expected_dWc - diff_Wc))
    assert embedding.all_close(
        expected_dWc, diff_Wc, msg=msg_Wc
    )
    EWc4 = np.subtract(embedding.W[4], expected_dWc)
    EWc5 = np.subtract(embedding.W[5], expected_dWc * 2)
    EWc6 = np.subtract(embedding.W[6], expected_dWc)

    # --------------------------------------------------------------------------------
    # Backward path: Gradient descent
    # --------------------------------------------------------------------------------
    assert np.array_equal(embedding.target_indices, np.array([3], dtype=TYPE_INT))
    assert np.array_equal(embedding.context_indices, np.array([4, 5, 6, 5], dtype=TYPE_INT))
    dWe, dWs, dWc = embedding.update()

    # ********************************************************************************
    # Constraint:
    # - dW is close to EDW
    # - dL/dWe = dL/dWs = Bc when dL/dY = I
    # ********************************************************************************
    assert Function.all_close(
        EDWe, dWe,
        msg="Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe - dWe)
    )
    assert Function.all_close(
        EDWs, dWs,
        msg="Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs - dWs)
    )
    assert Function.all_close(
        EDWc, dWc,
        msg="Expected (EDWc==dWc)\n%s\ndifference\n%s\n" % (EDWc, EDWc - dWc)
    )
    assert Function.all_close(
        dWe, dWs,
        msg="Expected (dWe==dWs) but dWe:\n%s\ndifference\n%s\n" % (dWe, dWe - dWs)
    )

    # ********************************************************************************
    # Constraint: the optimizer differential equals lr * (1+l2) * dW
    # ********************************************************************************
    assert np.array_equal(expected_dWe, lr * (1 + l2) * dWe)
    assert np.array_equal(expected_dWc, lr * (1 + l2) * dWc)

    # W[3] = W[3] - [lr * (1 + l2) * dWe]
    assert Function.all_close(
        # EWe, embedding.W[3],
        EWe, embedding.WO[3],
        msg="Expected (EWe==W[3])\n%s\ndifference\n%s\n" % (EWe, EWe - embedding.W[3])
    )
    # W[4] = W[4] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc4, embedding.W[4],
        msg="Expected (EWc4==W[4])\n%s\ndifference\n%s\n" % (EWc4, EWc4 - embedding.W[4])
    )
    # W[5] = W[5] - [lr * (1 + l2) * 2 * dWc]
    assert Function.all_close(
        EWc5, embedding.W[5],
        msg="Expected (EWc5==W[5])\n%s\ndifference\n%s\n" % (EWc5, EWc5 - embedding.W[5])
    )
    # W[6] = W[6] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc6, embedding.W[6],
        msg="Expected (EWc6==W[6])\n%s\ndifference\n%s\n" % (EWc6, EWc6 - embedding.W[6])
    )
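# A small sketch of why np.ufunc.at matters in the test above: with duplicate
# indices, buffered fancy-index assignment applies the update only once, while
# np.subtract.at (an np.ufunc.at method) accumulates every occurrence. This is
# a standalone NumPy illustration, not the Embedding layer's update code.
def _sketch_ufunc_at():
    W = np.ones(7)
    idx = np.array([4, 5, 6, 5])        # index 5 occurs twice
    W[idx] -= 0.1                       # buffered: W[5] decremented once -> 0.9
    W2 = np.ones(7)
    np.subtract.at(W2, idx, 0.1)        # unbuffered: W2[5] decremented twice -> 0.8
    assert np.isclose(W[5], 0.9) and np.isclose(W2[5], 0.8)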
def sentence_to_sequence(
        sentences: str,
        event_to_index: Dict[str, TYPE_INT],
        minimum_length: TYPE_INT = TYPE_INT(0)
) -> List[List[TYPE_INT]]:
    """Generate sequences of event indices from text sentences
    1. Skip an empty line or a line that only contains non-words, e.g. punctuation.
    2. Return [[]] if there is no sequence generated.
    3. Pad each sequence to the length max(max_sequence_length, minimum_length).

    Sentence lengths vary, and a vectorization framework, e.g. numpy, may
    require rows of the same length; a numpy array cannot handle ragged
    numeric rows. Hence pad sequences to align them when minimum_length > 0.

    A sentence can be short, e.g. "I am". To create an (event, context) pair,
    at least (event_length + context_length) is required. If a generated
    sequence length < minimum_length, pad it to meet the minimum length.

    Args:
        sentences: A string including one or more sentences to process.
            A sentence is delimited by EOL('\n').
        event_to_index: event to integer index mapping dictionary
        minimum_length: minimum length of a generated sequence

    Returns: List of integer sequences, one per sentence
    """
    assert isinstance(sentences, str)

    sequences = []
    max_sequence_length = 0
    for line in sentences.split(EOL):
        if len(line.strip()) > 0:   # Skip empty line
            sequence = [
                event_to_index.get(
                    w, EVENT_META_ENTITY_TO_INDEX[EVENT_UNK.lower()])
                for w in Function.standardize(line).split()
            ]
            # A line may have punctuation only, which results in a null sequence.
            if len(sequence) > 0:
                max_sequence_length = max(max_sequence_length, len(sequence))
                sequences.append(sequence)
            else:
                Logger.warning("Sentence is empty. Skipping...")

    if len(sequences) > 0:
        if minimum_length > 0:  # padding required
            max_sequence_length = max(max_sequence_length, minimum_length)
            padded: List[List[TYPE_INT]] = [
                np.pad(
                    array=seq,
                    pad_width=(0, max_sequence_length - len(seq)),
                    constant_values=EVENT_META_ENTITY_TO_INDEX[EVENT_NIL.lower()]
                ).astype(TYPE_INT).tolist()
                for seq in sequences
            ]
            del sequences
        else:
            padded = sequences
    else:
        Logger.warning(
            "Return [[]] as no valid sentences in the input \n[%s]\n",
            sentences
        )
        padded = [[]]

    Logger.debug("Sequences generated for \n%s\n%s", sentences, padded)
    return padded
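# A minimal usage sketch of sentence_to_sequence with a toy event_to_index
# mapping. The concrete index values depend on EVENT_META_ENTITY_TO_INDEX and
# on how Function.standardize() normalizes the text, so only the shape of the
# result is asserted here; the mapping below is a hypothetical example.
def _sketch_sentence_to_sequence():
    mapping = {"i": TYPE_INT(2), "am": TYPE_INT(3)}
    padded = sentence_to_sequence(
        "I am", event_to_index=mapping, minimum_length=TYPE_INT(4)
    )
    # One sentence, indexed then NIL-padded up to minimum_length 4.
    assert len(padded) == 1 and len(padded[0]) == 4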
def test_020_adapt_embedding_loss_adapter_gradient_to_succeed(caplog):
    """
    Objective:
        Verify the Adapter gradient method handles dY in shape (N, 1+SL), where
        Adapter.function(Y) for Y:(N, 1+SL) returns the shape (N*(1+SL), 1) and
        sets the log loss T to the same shape.
    Expected:
        The loss and the gradient dL/dY match those calculated directly from
        sigmoid_cross_entropy_log_loss.
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_adapt_embedding_logistic_loss_function_multi_lines"
    sentences = """
    Verify the EventIndexing function can handle multi line sentences
    the asbestos fiber <unk> is unusually <unk> once it enters the <unk>
    with even brief exposures to it causing symptoms that show up decades later researchers said
    """
    dictionary: EventIndexing = _instantiate_event_indexing()

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        # First validate the correct configuration, then change parameters one by one.
        E = target_size = TYPE_INT(np.random.randint(1, 3))
        C = context_size = TYPE_INT(2 * np.random.randint(1, 5))
        SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5))
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20))
        W: TYPE_TENSOR = np.random.rand(dictionary.vocabulary_size, event_vector_size)

        loss, adapter, embedding, event_context = _instantiate(
            name=name,
            num_nodes=TYPE_INT(1),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
        )

        # ================================================================================
        # Forward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # Event indexing
        # --------------------------------------------------------------------------------
        sequences = dictionary.function(sentences)

        # --------------------------------------------------------------------------------
        # Event context pairs
        # --------------------------------------------------------------------------------
        target_context_pairs = event_context.function(sequences)

        # --------------------------------------------------------------------------------
        # Embedding
        # --------------------------------------------------------------------------------
        Y = embedding.function(target_context_pairs)
        N, _ = embedding.tensor_shape(Y)
        batch_size = TYPE_FLOAT(N * (1 + SL))

        # --------------------------------------------------------------------------------
        # Adapter
        # --------------------------------------------------------------------------------
        Z = adapter.function(Y)

        # --------------------------------------------------------------------------------
        # Loss
        # --------------------------------------------------------------------------------
        L = loss.function(Z)

        # ********************************************************************************
        # Constraint:
        #   loss.T is set to the T by adapter.function()
        # ********************************************************************************
        T = np.zeros(shape=(N, (1 + SL)), dtype=TYPE_LABEL)
        T[::, 0] = TYPE_LABEL(1)
        assert embedding.all_equal(T.reshape(-1, 1), loss.T), \
            "Expected T must equal loss.T. Expected\n%s\nLoss.T\n%s\n" % (T, loss.T)

        # ********************************************************************************
        # Constraint:
        #   Expected loss is sum(sigmoid_cross_entropy_log_loss(Y, T)) / (N*(1+SL)).
        #   The batch size for the Log Loss is (N*(1+SL)).
        # ********************************************************************************
        EJ, EP = sigmoid_cross_entropy_log_loss(X=Z, T=T.reshape(-1, 1))
        EL = np.sum(EJ, dtype=TYPE_FLOAT) / batch_size
        assert embedding.all_close(EL, L), \
            "Expected EL=L but EL=\n%s\nL=\n%s\nDiff=\n%s\n" % (EL, L, (EL - L))

        # ================================================================================
        # Backward path
        # ================================================================================
        # ********************************************************************************
        # Constraint:
        #   Expected dL/dY from the Log Loss is (P-T)/N
        # ********************************************************************************
        EDY = (sigmoid(Y) - T.astype(TYPE_FLOAT)) / batch_size
        assert EDY.shape == Y.shape

        dY = adapter.gradient(loss.gradient(TYPE_FLOAT(1)))
        assert dY.shape == Y.shape
        assert embedding.all_close(EDY, dY), \
            "Expected EDY==dY. EDY=\n%s\nDiff\n%s\n" % (EDY, (EDY - dY))

    profiler.disable()
    profiler.print_stats(sort="cumtime")
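# A standalone numeric check of the identity the test above relies on:
# for the sigmoid cross-entropy loss L = (-t*log(s) - (1-t)*log(1-s)) / N
# with s = sigmoid(y), the analytical gradient is dL/dy = (sigmoid(y) - t) / N.
# A local _sig helper is used to avoid shadowing the module-level sigmoid.
def _sketch_sigmoid_loss_gradient(y=0.3, t=1.0, N=1, h=1e-6):
    _sig = lambda z: 1.0 / (1.0 + np.exp(-z))
    L = lambda v: (-t * np.log(_sig(v)) - (1 - t) * np.log(1 - _sig(v))) / N
    numerical = (L(y + h) - L(y - h)) / (2 * h)     # central difference
    analytical = (_sig(y) - t) / N
    assert abs(numerical - analytical) < 1e-6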