def test_report_to_kv_store(self):
    opt = SGD(momentum=0.1)
    opt_wrapper = OptimizerWrapper(opt, None, {})

    ids_list = [[1, 5], [10]]
    opt_wrapper._unique_ids_all_layers = {
        "test_1": np.array(ids_list[0]),
        "test_2": np.array(ids_list[1]),
    }
    t = np.array([1.0, 1.0, 1.0])
    opt_wrapper._embed_variables = {
        "test_1": tf.Variable([t, t * 5]),
        "test_2": tf.Variable([t * 10]),
    }
    opt_wrapper._slot_variables = {
        "test_1": {"momentum": tf.Variable([t / 10.0, t / 2.0])},
        "test_2": {"momentum": tf.Variable([t])},
    }

    mock_kv_store = MockKvStore({})
    with mock.patch.object(
        EmbeddingService, "update_embedding", mock_kv_store.update
    ):
        opt_wrapper._report_to_kv_store()

    expected_mock_kv_store = MockKvStore({})
    expected_mock_kv_store.update(
        keys=["test_1-1", "test_1-5", "test_2-10"],
        values=[t, t * 5.0, t * 10.0],
    )
    expected_mock_kv_store.update(
        keys=[
            "test_1-momentum-1",
            "test_1-momentum-5",
            "test_2-momentum-10",
        ],
        values=[t / 10.0, t / 2.0, t],
    )
    for k, ids in zip(["test_1", "test_2"], ids_list):
        for id in ids:
            key = Embedding.get_key([k, id])
            v, _ = mock_kv_store.lookup([key])
            expected_v, _ = expected_mock_kv_store.lookup([key])
            self.assertTrue((v[0] == expected_v[0]).all())

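# A note on the key naming checked above (a sketch; it assumes
# `Embedding.get_key` simply joins its parts with "-", which is what the
# expected keys in this test imply): embedding vectors are stored in the KV
# store under "<layer_name>-<id>" and slot values under
# "<layer_name>-<slot_name>-<id>", e.g.
#
#     Embedding.get_key(["test_1", 1])              # -> "test_1-1"
#     Embedding.get_key(["test_1", "momentum", 1])  # -> "test_1-momentum-1"
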
def test_set_embedding_values_to_variables(self):
    layers = ["test-1", "test-2"]
    id_num = 3
    embedding_dims = {layer: 4 for layer in layers}
    # 2 layers * 3 ids each -> 6 rows of embedding values
    all_values = np.arange(24).reshape(6, 4).astype(np.float32)

    embedding_values = {}
    offset = 0
    for layer in layers:
        start = offset
        end = offset + id_num
        embedding_values.setdefault(layer, all_values[start:end])
        offset = end

    opt = SGD()
    opt_wrapper = OptimizerWrapper(opt, None, embedding_dims)
    grads_and_vars = [
        ("test-1-grads", "test-1"),
        ("test-2-grads", "test-2"),
    ]
    opt_wrapper._set_embedding_values_to_variables(
        grads_and_vars, embedding_values
    )
    for i, layer in enumerate(layers):
        self.assertTrue(
            (
                opt_wrapper._embed_variables[layer].numpy()
                == embedding_values[layer]
            ).all()
        )
        self.assertTrue(
            (
                grads_and_vars[i][1].numpy()
                == opt_wrapper._embed_variables[layer].numpy()
            ).all()
        )

    embedding_values_new = {"test-1": np.zeros((3, 4), np.float32)}
    grads_and_vars = [("test-1-grads", "test-1")]
    opt_wrapper._set_embedding_values_to_variables(
        grads_and_vars, embedding_values_new
    )
    self.assertTrue(
        (opt_wrapper._embed_variables["test-1"].numpy() < 0.0001).all()
    )

def _compare_initialize_values(self, opt, dim, slot, expected_init):
    tmp = OptimizerWrapper(opt, None, {"test": dim})
    self.assertTrue(
        (
            tmp._initialize_unknown_slot("test", slot) - expected_init(dim)
            < 0.0001
        ).all()
    )

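# Example usage of the helper above (hypothetical calls; they assume the test
# class defines `_compare_initialize_values` as shown). Keras Adam and SGD
# initialize their slots to zeros, so `np.zeros` is the expected initializer:
#
#     self._compare_initialize_values(Adam(), 4, "m", np.zeros)
#     self._compare_initialize_values(SGD(momentum=0.5), 4, "momentum", np.zeros)
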
def _compare_slot_names(self, opt, expected):
    tmp = OptimizerWrapper(opt, None, {})
    self.assertTrue(sorted(tmp.allowed_slot_names) == sorted(expected))

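# Example usage of the helper above (hypothetical calls based on the slot
# names Keras optimizers create): plain SGD has no slots, SGD with momentum
# has a "momentum" slot, Adam has "m" and "v", and Adam with amsgrad adds
# "vhat":
#
#     self._compare_slot_names(SGD(), [])
#     self._compare_slot_names(SGD(momentum=0.2), ["momentum"])
#     self._compare_slot_names(Adam(), ["m", "v"])
#     self._compare_slot_names(Adam(amsgrad=True), ["m", "v", "vhat"])
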
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, params, random_seed
):
    """Train model with optimizer wrapper."""
    tf.random.set_seed(random_seed)
    opt_wrapper = OptimizerWrapper(
        opt_keras,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    embed_layers = find_layer(model, Embedding)

    # initialize slot params
    params.create_slot_params(
        opt_wrapper.allowed_slot_names, opt_wrapper.slot_initial_value
    )

    # initialize ElasticDL embedding layer
    for layer in embed_layers:
        layer.set_lookup_embedding_func(params.get_embedding_param)

    # training process
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Need to get non-embedding variables inside the for loop because
        # the model creates variables the first time `model.call` is called
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers
            )
        embed_items = []
        for layer in embed_layers:
            embed_items.extend(
                [
                    (bet, layer.name, ids)
                    for bet, ids in layer.embedding_and_ids
                ]
            )

        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items]
        )

        # TODO: no need to merge gradients from the same embedding layer
        # once `optimizer_wrapper` supports grads_and_vars with duplicated
        # layer names
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(
            embed_items, grads[non_embed_vars_n:]
        ):
            if layer_name in embed_grads_dict:
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        opt_wrapper.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars))
            + [
                (grad, layer_name)
                for layer_name, grad in embed_grads_dict.items()
            ]
        )

        for layer in embed_layers:
            layer.reset()

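# The gradient-merging loop above concatenates the `tf.IndexedSlices`
# produced by repeated calls into the same embedding layer. A minimal,
# self-contained sketch of that merge (the names and values below are
# illustrative only, not part of the helper):
def _merge_indexed_slices_example():
    g1 = tf.IndexedSlices(tf.ones((2, 4)), tf.constant([0, 3], tf.int64))
    g2 = tf.IndexedSlices(2.0 * tf.ones((1, 4)), tf.constant([3], tf.int64))
    merged = tf.IndexedSlices(
        tf.concat([g1.values, g2.values], axis=0),
        tf.concat([g1.indices, g2.indices], axis=0),
    )
    # merged.indices == [0, 3, 3]; duplicated ids are kept as-is and are
    # expected to be accumulated when the merged gradient is applied.
    return merged
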
def _test_async_correctness(
    self,
    grads_and_vars_batches,
    embed_values,
    expected_non_embed_values,
    expected_embed_values=None,
):
    """Checks the correctness of async OptimizerWrapper.

    This function creates many threads and these threads call
    `OptimizerWrapper.apply_gradients` simultaneously.

    Args:
        grads_and_vars_batches: A python list of `grads_and_vars`. Every
            thread takes a `grads_and_vars` and calls `apply_gradients`.
        embed_values: A python dictionary of
            `(layer_name, embedding table)`.
        expected_non_embed_values: A python list of expected non-embedding
            values after applying gradients.
        expected_embed_values: A python dictionary of expected embedding
            values after applying gradients. None means there is no need to
            check embedding values.
    """
    thread_num = len(grads_and_vars_batches)
    input_dims = {}
    embed_var_n = len(embed_values)
    params = Parameters()
    for layer, values in embed_values.items():
        embed_dim = values.shape[1]
        input_dims[layer] = values.shape[0]
        embed_table = EmbeddingTable(layer, embed_dim)
        embed_table.set(range(input_dims[layer]), values)
        params.embedding_params[layer] = embed_table

    opt = SGD(0.1)
    opt_wrapper = OptimizerWrapper(
        opt,
        True,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    # call optimizer_wrapper.apply_gradients asynchronously
    def _apply_gradients(opt_wrapper, grads_and_vars):
        # sleep 1s so that all threads reach this method call
        time.sleep(1)
        opt_wrapper.apply_gradients(grads_and_vars)

    executor = ThreadPoolExecutor(max_workers=thread_num)
    tasks = [
        executor.submit(_apply_gradients, opt_wrapper, grads_and_vars)
        for grads_and_vars in grads_and_vars_batches
    ]
    _ = [task.result() for task in tasks]

    # check updated results of non-embedding variables
    non_embed_vars = [
        var for grad, var in grads_and_vars_batches[0][:-embed_var_n]
    ]
    for var, expected_value in zip(
        non_embed_vars, expected_non_embed_values
    ):
        self.assertTrue(np.isclose(var.numpy(), expected_value).all())

    # `expected_embed_values=None` means there is no need to check the
    # embedding table
    if not expected_embed_values:
        return

    # check updated results of the embedding table
    for layer, expected_values in expected_embed_values.items():
        value = params.get_embedding_param(layer, range(input_dims[layer]))
        self.assertTrue(
            any(
                [
                    np.isclose(value, expected).all()
                    for expected in expected_values
                ]
            )
        )

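# A hypothetical invocation of the helper above, kept to a single thread so
# the expected values are deterministic (the layer name, variable, and
# numbers here are illustrative only). With SGD(0.1) and a gradient of
# [1., 1.], the dense variable moves from [1., 2.] to [0.9, 1.9]; the racy
# embedding check is skipped by leaving `expected_embed_values` as None:
#
#     embed_values = {"layer_1": np.zeros((4, 3), np.float32)}
#     dense_var = tf.Variable([1.0, 2.0])
#     grads_and_vars = [
#         (tf.constant([1.0, 1.0]), dense_var),
#         (tf.IndexedSlices(tf.ones((2, 3)), tf.constant([0, 2])), "layer_1"),
#     ]
#     self._test_async_correctness(
#         [grads_and_vars], embed_values, [np.array([0.9, 1.9])]
#     )
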
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, embed_dims, random_seed
):
    """Train model with optimizer wrapper."""
    tf.random.set_seed(random_seed)
    optimizer = OptimizerWrapper(opt_keras, None, embed_dims)

    # initialization related to the embedding layer and optimizer wrapper
    embed_layers = find_layer(model, Embedding)

    def lookup_func(ids, layer_name, initializer, output_dim):
        values, unknown = EmbeddingService.lookup_embedding(
            [Embedding.get_key([layer_name, i]) for i in ids]
        )
        return np.concatenate(values).reshape(len(ids), -1)

    for layer in embed_layers:
        layer.set_lookup_func(lookup_func)

    # training process
    for features, labels in zip(X, Y):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # TODO: calculating train_vars_embed and train_vars_other can be
        # moved into a reusable function
        train_vars_embed = []
        train_vars_other = []
        for layer in model.layers:
            if isinstance(layer, Embedding):
                for bet, ids in layer.bet_ids_pair:
                    train_vars_embed.append((bet, layer.name, ids))
            else:
                vars = layer.trainable_variables
                train_vars_other.extend(vars)

        grads = tape.gradient(
            loss,
            train_vars_other + [var for var, _, _ in train_vars_embed],
        )

        # TODO: no need to merge gradients from the same embedding layer
        # once `optimizer_wrapper` supports grads_and_vars with duplicated
        # layer names
        train_vars_other_len = len(train_vars_other)
        grads_new = grads[:train_vars_other_len]
        grads_embed_dict = {}
        for (_, layer_name, ids), grad in zip(
            train_vars_embed, grads[train_vars_other_len:]
        ):
            if layer_name in grads_embed_dict:
                grads_merged = grads_embed_dict[layer_name]
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([grads_merged.values, grad.values], axis=0),
                    tf.concat([grads_merged.indices, ids], axis=0),
                )
            else:
                grads_embed_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        optimizer.apply_gradients(
            list(zip(grads_new, train_vars_other))
            + [
                (grad, layer_name)
                for layer_name, grad in grads_embed_dict.items()
            ]
        )

        for layer in embed_layers:
            layer.reset()

def test_set_slot_values_to_variables(self):
    layers = ["test-1", "test-2"]
    slots = ["m", "v"]
    id_num = 3
    embedding_dims = {layer: 4 for layer in layers}
    all_values = np.arange(48).reshape(12, 4).astype(np.float32)

    slot_values = {}
    offset = 0
    for layer in layers:
        for slot in slots:
            start = offset
            end = offset + id_num
            slot_values.setdefault(layer, {}).setdefault(
                slot, all_values[start:end]
            )
            offset = end

    opt = Adam()
    opt_wrapper = OptimizerWrapper(opt, None, embedding_dims)
    for layer in layers:
        opt_wrapper._create_embedding_variable(layer, tf.zeros((1, 4)))
    opt_wrapper._set_slot_values_to_variables(slot_values)
    self.assertTrue(len(opt.weights) == 4)
    for layer in layers:
        slots_dict = None
        for k, v in opt._slots.items():
            if k.startswith(layer):
                slots_dict = v
                break

        for slot in slots:
            self.assertTrue(
                (
                    slots_dict[slot].numpy() == slot_values[layer][slot]
                ).all()
            )
            self.assertTrue(
                (
                    slots_dict[slot].numpy()
                    == opt_wrapper._slot_variables[layer][slot].numpy()
                ).all()
            )

            slots_dict[slot].assign(tf.ones((10, 4)))
            self.assertTrue(
                (
                    opt_wrapper._slot_variables[layer][slot].numpy() - 1.0
                    < 0.0001
                ).all()
            )
            opt_wrapper._slot_variables[layer][slot].assign(
                -tf.ones((10, 4))
            )
            self.assertTrue(
                (slots_dict[slot].numpy() + 1.0 < 0.0001).all()
            )

    slot_values_new = {"test-1": {"m": np.zeros((3, 4), np.float32)}}
    opt_wrapper._set_slot_values_to_variables(slot_values_new)
    self.assertTrue(
        (opt_wrapper._slot_variables["test-1"]["m"].numpy() < 0.0001).all()
    )

def _init_optimizer(self, opt, use_async):
    self._modulate_lr_if_needed(opt)
    if opt:
        return OptimizerWrapper(opt, use_async)
    return opt

def _test_async_correctness(
    self,
    grads_and_vars_batches,
    embed_values,
    expected_non_embed_values,
    expected_embed_values=None,
):
    """Checks the correctness of async OptimizerWrapper.

    This function creates many threads and these threads call
    `OptimizerWrapper.apply_gradients` simultaneously.

    Args:
        grads_and_vars_batches: A python list of `grads_and_vars`. Every
            thread takes a `grads_and_vars` and calls `apply_gradients`.
        embed_values: A python dictionary of
            `(layer_name, embedding table)`.
        expected_non_embed_values: A python list of expected non-embedding
            values after applying gradients.
        expected_embed_values: A python dictionary of expected embedding
            values after applying gradients. None means there is no need to
            check embedding values.
    """
    thread_num = len(grads_and_vars_batches)
    embed_dims = {}
    embed_var_n = len(embed_values)
    mock_kv_store = MockKvStore()
    for layer, values in embed_values.items():
        embed_dims[layer] = values.shape[1]
        input_dim = values.shape[0]
        keys = [
            Embedding.get_key([layer, idx]) for idx in range(input_dim)
        ]
        mock_kv_store.update(keys, values)

    opt = SGD(0.1)
    opt_wrapper = OptimizerWrapper(opt, None, embed_dims, True)
    with mock.patch.object(
        EmbeddingService, "lookup_embedding", mock_kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", mock_kv_store.update
    ):
        # call optimizer_wrapper.apply_gradients asynchronously
        def _apply_gradients(opt_wrapper, grads_and_vars):
            # sleep 1s so that all threads reach this method call
            time.sleep(1)
            opt_wrapper.apply_gradients(grads_and_vars)

        executor = ThreadPoolExecutor(max_workers=thread_num)
        tasks = [
            executor.submit(_apply_gradients, opt_wrapper, grads_and_vars)
            for grads_and_vars in grads_and_vars_batches
        ]
        _ = [task.result() for task in tasks]

    # check updated results of non-embedding variables
    non_embed_vars = [
        var for grad, var in grads_and_vars_batches[0][:-embed_var_n]
    ]
    for var, expected_value in zip(
        non_embed_vars, expected_non_embed_values
    ):
        self.assertTrue(np.isclose(var.numpy(), expected_value).all())

    # `expected_embed_values=None` means there is no need to check the
    # embedding table
    if not expected_embed_values:
        return

    # check updated results of the embedding table
    for layer, expected_values in expected_embed_values.items():
        keys = [
            Embedding.get_key([layer, idx]) for idx in range(input_dim)
        ]
        raw_value, _ = mock_kv_store.lookup(keys)
        value = np.concatenate(raw_value).reshape(input_dim, -1)
        self.assertTrue(
            any(
                [
                    np.isclose(value, expected).all()
                    for expected in expected_values
                ]
            )
        )