def test_delete_variables(self):
    params = Parameters()
    embed_layers = ["test_1", "test_2"]
    slot_names = ["m", "v"]
    dim = 8
    for layer in embed_layers:
        params.embedding_params[layer] = EmbeddingTable(layer, dim)
        for slot in slot_names:
            slot_key = get_slot_table_name(layer, slot)
            params.embedding_params[slot_key] = EmbeddingTable(
                slot_key, dim, "0.0", True
            )

    opt = Adam()
    opt_wrapper = OptimizerWrapper(
        opt, None, params.get_embedding_param, params.set_embedding_param
    )
    opt_wrapper._init_thread_local()

    for name in embed_layers:
        # Use deterministic ids and values instead of uninitialized
        # `np.ndarray` buffers.
        opt_wrapper._tls._unique_ids_all_layers[name] = np.arange(
            2, dtype=np.int32
        )
        opt_wrapper._create_embedding_variable(
            name, np.zeros((2, dim), np.float32)
        )
        opt_wrapper._get_slot_and_set_to_optimizer(name)

    # Adam tracks two slot variables ("m" and "v") per embedding
    # variable: 2 layers * 2 slots = 4 weights.
    self.assertEqual(len(opt._weights), 4)
    self.assertEqual(len(opt._slots), 2)
    for slot_dict in opt._slots.values():
        self.assertEqual(len(slot_dict), 2)

    opt_wrapper._delete_slots_and_weights_in_optimizer()
    self.assertEqual(len(opt._weights), 0)
    self.assertEqual(len(opt._slots), 0)
def test_set_slot_to_optimizer(self):
    embed_name = "test_emb"
    # Use deterministic values instead of uninitialized `np.ndarray`
    # buffers so the comparison below cannot fail at random. The slot
    # tables are created with initializer "0.0", so the looked-up slot
    # values are all zeros.
    indices = np.arange(2, dtype=np.int32)
    embed_values = np.zeros((2, 2), dtype=np.float32)
    slot_values = {
        "m": np.zeros((2, 2), dtype=np.float32),
        "v": np.zeros((2, 2), dtype=np.float32),
    }
    params = Parameters()
    params.embedding_params[embed_name] = EmbeddingTable(embed_name, 2)
    for slot in ["m", "v"]:
        slot_table_name = get_slot_table_name(embed_name, slot)
        params.embedding_params[slot_table_name] = EmbeddingTable(
            slot_table_name, 2, "0.0", True
        )

    opt = Adam()
    opt_wrapper = OptimizerWrapper(opt, None, params.get_embedding_param)
    opt_wrapper._init_thread_local()
    opt_wrapper._tls._unique_ids_all_layers[embed_name] = indices
    opt_wrapper._create_embedding_variable(embed_name, embed_values)
    opt_wrapper._get_slot_and_set_to_optimizer(embed_name)

    self.assertEqual(len(opt._slots), 1)
    opt_slots = list(opt._slots.values())[0]
    self.assertEqual(sorted(opt_slots.keys()), ["m", "v"])
    for name in ["m", "v"]:
        self.assertTrue(
            np.allclose(opt_slots[name].numpy(), slot_values[name])
        )
def test_update_embedding_param(self):
    params = Parameters()
    for name in ["test_1", "test_2"]:
        params.embedding_params[name] = EmbeddingTable(name, 8)
        slot_key = get_slot_table_name(name, "momentum")
        params.embedding_params[slot_key] = EmbeddingTable(
            slot_key, 8, "0.0", True
        )

    indices = {
        "test_1": np.array([1, 5]),
        "test_2": np.array([10]),
    }
    embed_vars = {
        "test_1": tf.Variable(np.random.rand(2, 8).astype(np.float32)),
        "test_2": tf.Variable(np.random.rand(1, 8).astype(np.float32)),
    }
    slot_vars = {
        "test_1": {
            "momentum": tf.Variable(np.random.rand(2, 8).astype(np.float32))
        },
        "test_2": {
            "momentum": tf.Variable(np.random.rand(1, 8).astype(np.float32))
        },
    }

    opt = SGD(momentum=0.1)
    opt_wrapper = OptimizerWrapper(
        opt, None, None, params.set_embedding_param
    )
    opt_wrapper._tls._unique_ids_all_layers = indices
    opt_wrapper._tls._embed_variables = embed_vars
    opt_wrapper._tls._slot_variables = slot_vars
    opt_wrapper._update_embedding_param()

    for name in ["test_1", "test_2"]:
        self.assertTrue(
            np.allclose(
                embed_vars[name].numpy(),
                params.get_embedding_param(name, indices[name]),
            )
        )
        slot = "momentum"
        slot_table_name = get_slot_table_name(name, slot)
        self.assertTrue(
            np.allclose(
                slot_vars[name][slot].numpy(),
                params.get_embedding_param(slot_table_name, indices[name]),
            )
        )
def _train_edl_embedding_with_optimizer_wrapper(
    model, opt_keras, X, Y, loss_fn, params, random_seed
):
    """Train a model with OptimizerWrapper."""
    tf.random.set_seed(random_seed)
    opt_wrapper = OptimizerWrapper(
        opt_keras,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    embed_layers = find_layer(model, Embedding)

    # Initialize slot parameters.
    params.create_slot_params(
        opt_wrapper.allowed_slot_names, opt_wrapper.slot_initial_value
    )

    # Initialize the ElasticDL embedding layers.
    for layer in embed_layers:
        layer.set_lookup_embedding_func(params.get_embedding_param)

    # Training process.
    for train_iter, (features, labels) in enumerate(zip(X, Y)):
        with tf.GradientTape() as tape:
            for layer in embed_layers:
                layer.set_tape(tape)
            outputs = model.call(features)
            loss = loss_fn(outputs, labels)

        # Need to get the non-embedding variables inside the loop because
        # the model creates its variables the first time `model.call` is
        # called.
        if not train_iter:
            non_embed_vars = get_non_embedding_trainable_vars(
                model, embed_layers
            )
        embed_items = []
        for layer in embed_layers:
            embed_items.extend(
                [
                    (bet, layer.name, ids)
                    for bet, ids in layer.embedding_and_ids
                ]
            )

        grads = tape.gradient(
            loss, non_embed_vars + [var for var, _, _ in embed_items]
        )

        # TODO: no need to merge gradients from the same embedding layer
        # once `optimizer_wrapper` supports grads_and_vars with duplicated
        # layer names.
        non_embed_vars_n = len(non_embed_vars)
        non_embed_grads = grads[:non_embed_vars_n]
        embed_grads_dict = {}
        for (_, layer_name, ids), grad in zip(
            embed_items, grads[non_embed_vars_n:]
        ):
            if layer_name in embed_grads_dict:
                merged_grads = embed_grads_dict[layer_name]
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    tf.concat([merged_grads.values, grad.values], axis=0),
                    tf.concat([merged_grads.indices, ids], axis=0),
                )
            else:
                embed_grads_dict[layer_name] = tf.IndexedSlices(
                    grad.values, ids
                )

        opt_wrapper.apply_gradients(
            list(zip(non_embed_grads, non_embed_vars))
            + [
                (grad, layer_name)
                for layer_name, grad in embed_grads_dict.items()
            ]
        )

        for layer in embed_layers:
            layer.reset()
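# A minimal standalone sketch of the merge step above: gradients for the
# same embedding layer arrive as separate `tf.IndexedSlices`, and merging
# just concatenates values and indices along axis 0. Duplicate indices are
# left in place; the optimizer's sparse update sums them. All values below
# are illustrative only.
import tensorflow as tf

g1 = tf.IndexedSlices(tf.constant([[1.0, 1.0]]), tf.constant([0]))
g2 = tf.IndexedSlices(tf.constant([[2.0, 2.0]]), tf.constant([0]))
merged = tf.IndexedSlices(
    tf.concat([g1.values, g2.values], axis=0),
    tf.concat([g1.indices, g2.indices], axis=0),
)
var = tf.Variable([[5.0, 5.0]])
tf.keras.optimizers.SGD(1.0).apply_gradients([(merged, var)])
# Row 0 receives the summed update: 5.0 - (1.0 + 2.0) == 2.0.
assert var.numpy().tolist() == [[2.0, 2.0]]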
def _test_async_correctness(
    self,
    grads_and_vars_batches,
    embed_values,
    expected_non_embed_values,
    expected_embed_values=None,
):
    """Checks the correctness of async OptimizerWrapper.

    This function creates multiple threads that call
    `OptimizerWrapper.apply_gradients` simultaneously.

    Args:
        grads_and_vars_batches: A python list of `grads_and_vars`. Each
            thread takes one `grads_and_vars` and calls `apply_gradients`
            with it.
        embed_values: A python dictionary of
            `(layer_name, embedding table)`.
        expected_non_embed_values: A python list of expected
            non-embedding values after applying gradients.
        expected_embed_values: A python dictionary of expected embedding
            values after applying gradients. `None` means there is no
            need to check the embedding values.
    """
    thread_num = len(grads_and_vars_batches)
    input_dims = {}
    embed_var_n = len(embed_values)
    params = Parameters()
    for layer, values in embed_values.items():
        embed_dim = values.shape[1]
        input_dims[layer] = values.shape[0]
        embed_table = EmbeddingTable(layer, embed_dim)
        embed_table.set(range(input_dims[layer]), values)
        params.embedding_params[layer] = embed_table

    opt = SGD(0.1)
    opt_wrapper = OptimizerWrapper(
        opt,
        True,
        lookup_embedding_func=params.get_embedding_param,
        update_embedding_func=params.set_embedding_param,
    )

    # Call `opt_wrapper.apply_gradients` asynchronously from many threads.
    def _apply_gradients(opt_wrapper, grads_and_vars):
        # Sleep 1s so that all threads enter this call before any of them
        # starts applying gradients.
        time.sleep(1)
        opt_wrapper.apply_gradients(grads_and_vars)

    executor = ThreadPoolExecutor(max_workers=thread_num)
    tasks = [
        executor.submit(_apply_gradients, opt_wrapper, grads_and_vars)
        for grads_and_vars in grads_and_vars_batches
    ]
    _ = [task.result() for task in tasks]

    # Check the updated values of the non-embedding variables.
    non_embed_vars = [
        var for grad, var in grads_and_vars_batches[0][:-embed_var_n]
    ]
    for var, expected_value in zip(
        non_embed_vars, expected_non_embed_values
    ):
        self.assertTrue(np.isclose(var.numpy(), expected_value).all())

    # `expected_embed_values=None` means there is no need to check the
    # embedding table.
    if not expected_embed_values:
        return
    # Check the updated values of the embedding table. Since the threads
    # may interleave in any order, the result only has to match one of
    # the expected value sets.
    for layer, expected_values in expected_embed_values.items():
        value = params.get_embedding_param(layer, range(input_dims[layer]))
        self.assertTrue(
            any(
                np.isclose(value, expected).all()
                for expected in expected_values
            )
        )
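# Why the non-embedding check above can expect exact values even though
# the threads race: a plain SGD step `var -= lr * grad` commutes, so the
# final value is var0 - lr * sum(all grads) under any interleaving. A tiny
# sketch with illustrative numbers:
import numpy as np

var0, lr, grads = 1.0, 0.1, [2.0, 3.0]
for order in (grads, grads[::-1]):
    var = var0
    for g in order:
        var -= lr * g
    # Both execution orders give 1.0 - 0.1 * (2.0 + 3.0) = 0.5.
    assert np.isclose(var, 0.5)
# Embedding rows, by contrast, are looked up, updated, and written back,
# so different interleavings can produce different tables; that is why
# `expected_embed_values` lists every acceptable outcome and the test
# accepts any one of them.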
def _compare_slot_names(self, opt, expected):
    tmp = OptimizerWrapper(opt)
    self.assertEqual(sorted(tmp.allowed_slot_names), sorted(expected))
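# One possible test built on the helper above. The expected names reflect
# the standard Keras optimizer slots: plain SGD has none, SGD with
# momentum tracks "momentum", and Adam tracks "m" and "v" (plus "vhat"
# when amsgrad=True).
def test_allowed_slot_names(self):
    self._compare_slot_names(SGD(), [])
    self._compare_slot_names(SGD(momentum=0.2), ["momentum"])
    self._compare_slot_names(Adam(), ["m", "v"])
    self._compare_slot_names(Adam(amsgrad=True), ["m", "v", "vhat"])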
class PserverServicer(elasticdl_pb2_grpc.PserverServicer):
    """PS service implementation"""

    def __init__(
        self,
        parameters,
        grads_to_wait,
        optimizer,
        lr_staleness_modulation=False,
        sync_version_tolerance=0,
        use_async=False,
        evaluation_steps=0,
        master_channel=None,
        checkpoint_saver=None,
        ps_id=None,
        num_ps_pods=None,
    ):
        if master_channel is None:
            self._master_stub = None
        else:
            self._master_stub = elasticdl_pb2_grpc.MasterStub(master_channel)

        self._parameters = parameters
        self._grads_to_wait = grads_to_wait
        self._optimizer = optimizer
        self._lr_staleness_modulation = lr_staleness_modulation
        self._sync_version_tolerance = sync_version_tolerance
        self._use_async = use_async
        self._eval_steps = evaluation_steps
        self._checkpoint_saver = checkpoint_saver
        self._ps_id = ps_id
        self._num_ps_pods = num_ps_pods
        self._version_lock = threading.Lock()
        self._lock = threading.Lock()
        self._use_wrap_opt = False

        self._grads_n = 0
        self._grads_buffer = {}

    def pull_dense_parameters(self, request, _):
        """Responds with all non-embedding parameters if initialized."""
        res = elasticdl_pb2.PullDenseParametersResponse()
        if not self._parameters.initialized:
            res.initialized = False
            return res

        # Only sync-SGD needs the lock.
        # TODO: use a read-write lock to support multiple concurrent reads.
        if not self._use_async:
            self._lock.acquire()
        res.version = self._parameters.version
        # No need to send variables if the requester has the latest version.
        if self._parameters.version > request.version:
            for name, var in self._parameters.non_embedding_params.items():
                serialize_ndarray(var.numpy(), res.dense_parameters[name])
        if not self._use_async:
            self._lock.release()
        res.initialized = True
        return res

    def pull_embedding_vectors(self, request, _):
        result = tensor_pb2.TensorProto()
        if not request.ids:
            return result
        embedding_vectors = self._parameters.get_embedding_param(
            request.name, request.ids
        )
        serialize_ndarray(embedding_vectors, result)
        return result

    def push_model(self, request, _):
        with self._lock:
            accepted = self._parameters.init_from_model_pb(request)
            if accepted and self._parameters.has_embedding_params():
                self.wrap_optimizer_and_set_slot()
        return empty_pb2.Empty()

    def push_embedding_table_infos(self, request, _):
        with self._lock:
            self._parameters.init_embedding_params(
                request.embedding_table_infos
            )
            self.wrap_optimizer_and_set_slot()
        return empty_pb2.Empty()

    def push_gradients(self, request, _):
        res = elasticdl_pb2.PushGradientsResponse()
        if self._use_async:
            grad_vars = []
            for name, pb in request.gradients.dense_parameters.items():
                grad = pb_to_ndarray(pb)
                self._parameters.check_grad(Tensor(name, grad, None))
                grad = tf.constant(grad)
                var = self._parameters.get_non_embedding_param(name)
                grad_vars.append((grad, var))

            for name, pb in request.gradients.embedding_tables.items():
                grad = pb_to_indexed_slices(pb)
                self._parameters.check_grad(
                    Tensor(name, grad.values, grad.indices)
                )
                if name in self._parameters.non_embedding_params:
                    var = self._parameters.get_non_embedding_param(name)
                    grad_vars.append((grad, var))
                else:
                    grad_vars.append((grad, name))

            learning_rate = request.learning_rate
            # TODO: if request.learning_rate == 0.0, modulate the learning
            # rate in self._optimizer with staleness.
            if self._lr_staleness_modulation and learning_rate > 0.0:
                staleness = max(
                    1, self._parameters.version - request.gradients.version
                )
                # Modulate the learning rate by staleness.
                learning_rate /= staleness

            self._set_optimizer_learning_rate(learning_rate)
            self._optimizer.apply_gradients(grad_vars)
            with self._version_lock:
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
            self._report_version_if_needed(version)
            res.accepted = True
            res.version = self._parameters.version
            return res
        else:
            if (
                request.gradients.version
                < self._parameters.version - self._sync_version_tolerance
            ):
                res.accepted = False
                res.version = self._parameters.version
                return res

            with self._lock:
                for name, pb in request.gradients.dense_parameters.items():
                    grad = pb_to_ndarray(pb)
                    self._parameters.check_grad(Tensor(name, grad, None))
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = (
                            self._grads_buffer[name] + grad
                        )
                    else:
                        self._grads_buffer[name] = grad

                for name, pb in request.gradients.embedding_tables.items():
                    grad = pb_to_indexed_slices(pb)
                    self._parameters.check_grad(
                        Tensor(name, grad.values, grad.indices)
                    )
                    if name in self._grads_buffer:
                        self._grads_buffer[name] = merge_indexed_slices(
                            self._grads_buffer[name], grad
                        )
                    else:
                        self._grads_buffer[name] = grad

                self._grads_n += 1
                res.accepted = True

                updated_version = False
                version = self._parameters.version
                if self._grads_n == self._grads_to_wait:
                    grad_vars = []
                    for name, grad in self._grads_buffer.items():
                        # Dense gradients are averaged,
                        # while sparse gradients are summed.
                        if not isinstance(grad, tf.IndexedSlices):
                            grad = grad / self._grads_to_wait
                            grad = tf.constant(grad)
                        var = self._parameters.get_non_embedding_param(name)
                        if var is None:
                            grad_vars.append((grad, name))
                        else:
                            grad_vars.append((grad, var))

                    self._set_optimizer_learning_rate(request.learning_rate)
                    self._optimizer.apply_gradients(grad_vars)
                    self._grads_n = 0
                    self._grads_buffer.clear()
                    self._parameters.version += 1
                    self._save_params_to_checkpoint_if_needed()
                    version = self._parameters.version
                    updated_version = True

            if updated_version:
                self._report_version_if_needed(version)
            res.version = version
            return res

    def wrap_optimizer(self):
        self._optimizer = OptimizerWrapper(
            self._optimizer,
            self._use_async,
            self._parameters.get_embedding_param,
            self._parameters.set_embedding_param,
        )

    def _report_version_if_needed(self, version):
        if self._eval_steps and version % self._eval_steps == 0:
            self._report_version(version)

    def _report_version(self, version):
        req = elasticdl_pb2.ReportVersionRequest()
        req.model_version = version
        self._master_stub.report_version(req)

    def wrap_optimizer_and_set_slot(self):
        if not self._use_wrap_opt:
            self.wrap_optimizer()
            self._parameters.create_slot_params(
                self._optimizer.allowed_slot_names,
                self._optimizer.slot_initial_value,
            )
            self._use_wrap_opt = True

    def _save_params_to_checkpoint_if_needed(self):
        """Save a checkpoint of parameters to a protobuf file."""
        if (
            self._checkpoint_saver
            and self._parameters.version % self._checkpoint_saver._steps == 0
        ):
            model_pb = self._parameters.to_model_pb()
            logger.info("Save checkpoint for version %s" % model_pb.version)
            self._checkpoint_saver.save(
                model_pb.version,
                model_pb,
                is_eval_checkpoint=False,
                shard_index=self._ps_id,
                shard_num=self._num_ps_pods,
            )

    def _set_optimizer_learning_rate(self, learning_rate):
        if learning_rate == 0.0:
            return
        if self._use_wrap_opt:
            self._optimizer.set_learning_rate(learning_rate)
        else:
            K.set_value(self._optimizer.lr, K.get_value(learning_rate))
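# A small sketch of the staleness modulation in `push_gradients` above,
# with illustrative numbers: if the PS is at version 12 and a worker
# computed its gradients at version 9, the update is three versions stale,
# so the learning rate is divided by three before applying the gradients.
ps_version, grad_version, learning_rate = 12, 9, 0.3
staleness = max(1, ps_version - grad_version)  # 3
learning_rate /= staleness
assert abs(learning_rate - 0.1) < 1e-12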