def get_params_shard_from_pb(model_pb, shard_index, shard_num):
    """Extract the parameters belonging to one model shard from a model pb.

    Args:
        model_pb: A Model protobuf instance.
        shard_index: Model shard index.
        shard_num: The total number of model shards.

    Returns:
        non_embedding_vars: Dict mapping a variable name to a
            `tf.Variable` holding that variable's values.
        embedding_table_values: Dict mapping an embedding table name to a
            tuple of two parallel lists: `(embedding_ids, vectors)`.
    """
    non_embedding_vars = {}
    embedding_table_values = {}

    for pb in model_pb.param:
        param = Tensor.from_tensor_pb(pb)
        if param.indices is None:
            # Dense variable: keep it only if its name hashes to this shard.
            if string_to_id(param.name, shard_num) == shard_index:
                non_embedding_vars[param.name] = tf.Variable(
                    initial_value=param.values, trainable=True)
        else:
            # Sparse embedding rows: keep only the ids owned by this shard.
            ids, vectors = embedding_table_values.setdefault(
                param.name, ([], []))
            for embedding_id, vector in zip(param.indices, param.values):
                if int_to_id(embedding_id, shard_num) == shard_index:
                    ids.append(embedding_id)
                    vectors.append(vector)
    return non_embedding_vars, embedding_table_values
def get_model(self):
    """Refresh the local non-embedding variables from every PS pod.

    Issues one asynchronous pull_variable RPC per relevant PS shard, then
    waits on the results.  A PS that reports an uninitialized model first
    receives our local variables (report_variable_to_ps) and is pulled
    again synchronously.  Updates self._non_embed_vars in place, records
    each PS's model version, and sets self._model_version to the maximum
    version seen across all PS pods.

    Raises:
        RuntimeError: If a PS pod still reports an uninitialized model
            after we pushed our variables to it.
    """
    self._timing.start_record_time("get_model")
    variable_future_and_id_pairs = []
    if self._use_multi_ps:
        # Recompute which of our variables live on which PS pod.
        self.init_ps_var_partition()
    for ps_id, stub in enumerate(self._ps_stubs):
        # Skip PS pods that own none of our variables.
        if ps_id not in self._ps_vars:
            continue
        # async grpc call
        req = elasticdl_pb2.PullVariableRequest()
        req.current_model_version = self._model_versions_from_ps[ps_id]
        var_future = stub.pull_variable.future(req)
        variable_future_and_id_pairs.append((var_future, ps_id))
    for var_future, ps_id in variable_future_and_id_pairs:
        res = var_future.result()
        if not res.model_init_status:
            # push variable to ps for initialization
            self.report_variable_to_ps(ps_id)
            # Retry the pull synchronously with our last known version.
            req = elasticdl_pb2.PullVariableRequest()
            req.current_model_version = self._model_versions_from_ps[ps_id]
            res = self._ps_stubs[ps_id].pull_variable(req)
            if not res.model_init_status:
                # TODO: support PS fault-tolerance
                raise RuntimeError("PS pod %d cannot be initialized" % ps_id)
        # Copy the pulled parameter values into our local tf.Variables.
        for tensor_pb in res.model.param:
            tensor = Tensor.from_tensor_pb(tensor_pb)
            self._non_embed_vars[tensor.name].assign(tensor.to_ndarray())
        self._model_versions_from_ps[ps_id] = res.model.version
    self._model_version = max(self._model_versions_from_ps)
    self._timing.end_record_time("get_model")
def get_model(self):
    """Pull the latest non-embedding variables from all PS pods.

    Sends one asynchronous pull_variable request per PS pod that owns
    some of our variables, pushes our variables to any PS that is not yet
    initialized (then pulls again), assigns the returned values into
    self._non_embed_vars, and records the highest model version observed
    in self._model_version.

    Raises:
        RuntimeError: If a PS pod remains uninitialized after we pushed
            our variables to it.
    """
    latest_version = -1
    pending = []
    req = empty_pb2.Empty()
    if self._use_multi_ps:
        self.init_ps_var_partition()
    # Fan out: one asynchronous grpc call per PS pod we care about.
    for ps_id, stub in enumerate(self._ps_stubs):
        if ps_id in self._ps_vars:
            pending.append((stub.pull_variable.future(req), ps_id))
    # Fan in: collect results in the same order the calls were issued.
    for future, ps_id in pending:
        res = future.result()
        if not res.model_init_status:
            # The PS has no variables yet; seed it with ours and retry.
            self.report_variable_to_ps(ps_id)
            res = self._ps_stubs[ps_id].pull_variable(req)
            if not res.model_init_status:
                # TODO: support PS fault-tolerance
                raise RuntimeError("PS pod %d cannot be initialized" % ps_id)
        for pb in res.model.param:
            pulled = Tensor.from_tensor_pb(pb)
            self._non_embed_vars[pulled.name].assign(pulled.to_ndarray())
        latest_version = max(latest_version, res.model.version)
    self._model_version = latest_version
def _get_non_embedding_variables(self, version, method):
    """Fetch the model from the master and return its parameters.

    Args:
        version: Model version to request.
        method: Get-model method to request.

    Returns:
        Dict mapping each parameter name to its values as an ndarray.
    """
    request = elasticdl_pb2.GetModelRequest()
    request.version = version
    request.method = method
    model = self._stub.GetModel(request, None)
    return {
        tensor.name: tensor.to_ndarray()
        for tensor in (Tensor.from_tensor_pb(pb) for pb in model.param)
    }
def get_model_from_master(self, version, method):
    """Pull the model from the master and update local state.

    Assigns each returned parameter into self._non_embed_vars and sets
    self._model_version to the returned model's version.

    Args:
        version: Model version to request.
        method: Get-model method to request.
    """
    request = elasticdl_pb2.GetModelRequest()
    request.version = version
    request.method = method
    model = self._stub.GetModel(request)
    # All trainable variables are assumed to be present in model.param.
    for pb in model.param:
        pulled = Tensor.from_tensor_pb(pb)
        self._non_embed_vars[pulled.name].assign(pulled.to_ndarray())
    self._model_version = model.version
def test_tensor_data_structure(self):
    """Exercise Tensor construction, pb round trip, and to_ndarray."""
    # Test tensor values, without indices
    arr = np.ndarray(shape=[3, 1, 2, 4], dtype=np.int32)
    tensor = Tensor(arr)
    self.assertTrue(np.array_equal(arr, tensor.values))
    self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor()))
    self.assertFalse(tensor.is_indexed_slices())

    # Test tensor values, with indices
    indices = np.array([2, 0, 1])
    tensor = Tensor(arr, indices)
    self.assertTrue(np.array_equal(arr, tensor.values))
    self.assertTrue(np.array_equal(indices, tensor.indices))
    self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor().values))
    self.assertTrue(np.array_equal(indices, tensor.to_tf_tensor().indices))
    self.assertTrue(tensor.is_indexed_slices())

    # Test round trip
    # tensor to tensor PB
    tensor = Tensor(arr, indices, name="test")
    pb = tensor.to_tensor_pb()
    self.assertEqual(pb.name, "test")
    self.assertEqual(pb.dim, [3, 1, 2, 4])
    self.assertEqual(pb.dtype, tensor_dtype_pb2.DT_INT32)
    np.testing.assert_array_equal(pb.indices, indices)

    # tensor PB to tensor
    tensor_new = Tensor.from_tensor_pb(pb)
    # BUGFIX: assert on the deserialized object (`tensor_new`), not the
    # original `tensor`, so the name is actually verified to round-trip.
    self.assertEqual(tensor_new.name, "test")
    np.testing.assert_array_equal(tensor_new.values, arr)
    np.testing.assert_array_equal(tensor_new.indices, indices)

    # Test Tensor().to_ndarray()
    values = np.array([[1.0, 2.0], [3.0, 4.0]])
    indices = np.array([0, 2])
    name = "test"
    tensor = Tensor(values, indices, name)
    # An indexed-slices tensor cannot be densified here.
    self.assertRaises(NotImplementedError, tensor.to_ndarray)

    tensor = Tensor(values, name=name)
    self.assertTrue(np.allclose(values, tensor.to_ndarray()))
def push_gradient(self, request, _):
    """Apply (async) or accumulate (sync) the gradients in `request`.

    Async mode: applies the gradients immediately, then bumps the
    parameter version under self._version_lock.

    Sync mode: rejects requests older than the tolerated version window,
    otherwise sums the gradients into self._grads_buffer under
    self._lock; once self._grads_to_wait pushes have arrived, averages
    dense grads, applies all buffered grads, and bumps the version.

    Returns:
        A PushGradientResponse with `accepted` and `model_version` set.
    """
    res = elasticdl_pb2.PushGradientResponse()
    if self._use_async:
        grad_vars = []
        for pb in request.gradients:
            grad = Tensor.from_tensor_pb(pb)
            self._parameters.check_grad(grad)
            name = grad.name
            var = self._parameters.get_non_embedding_param(name)
            grad = grad.to_tf_tensor()
            # Pair each gradient with its variable; fall back to the
            # name when no non-embedding parameter exists for it.
            if var is None:
                grad_vars.append((grad, name))
            else:
                grad_vars.append((grad, var))

        if self._lr_scheduler:
            self._lr_scheduler.set_model_version(self._parameters.version)
        self._optimizer.apply_gradients(grad_vars)
        # Version bump and checkpointing happen atomically.
        with self._version_lock:
            self._parameters.version += 1
            self._save_params_to_checkpoint_if_needed()
            version = self._parameters.version
        self._report_version_if_needed(version)

        res.accepted = True
        res.model_version = self._parameters.version
        return res
    else:
        # Reject pushes computed against a model that is too stale.
        if (request.model_version <
                self._parameters.version - self._sync_version_tolerance):
            res.accepted = False
            res.model_version = self._parameters.version
            return res

        with self._lock:
            # Accumulate this worker's gradients into the shared buffer.
            for pb in request.gradients:
                grad = Tensor.from_tensor_pb(pb)
                self._parameters.check_grad(grad)
                if grad.name in self._grads_buffer:
                    self._grads_buffer[grad.name] = (
                        self._grads_buffer[grad.name] + grad)
                else:
                    self._grads_buffer[grad.name] = grad

            self._grads_n += 1
            res.accepted = True

            updated_version = False
            version = self._parameters.version
            # Enough workers have pushed: apply the buffered gradients.
            if self._grads_n == self._grads_to_wait:
                grad_vars = []
                for name, grad in self._grads_buffer.items():
                    # Dense gradients are averaged,
                    # while sparse gradients are summed
                    if not grad.is_indexed_slices():
                        grad.values = grad.values / self._grads_to_wait
                    var = self._parameters.get_non_embedding_param(name)
                    grad = grad.to_tf_tensor()
                    if var is None:
                        grad_vars.append((grad, name))
                    else:
                        grad_vars.append((grad, var))

                if self._lr_scheduler:
                    self._lr_scheduler.set_model_version(
                        self._parameters.version)
                self._optimizer.apply_gradients(grad_vars)
                self._grads_n = 0
                self._grads_buffer.clear()
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
                updated_version = True

        # Report outside the lock to keep the critical section short.
        if updated_version:
            self._report_version_if_needed(version)
        res.model_version = version
        return res
def ReportGradient(self, request, _):
    """Validate and process the gradients reported by a worker.

    Splits the reported tensors into three groups — ElasticDL embedding
    gradients (names not in self._model, indexed slices), indexed
    gradients for known variables, and dense gradients — sanity-checking
    shapes and indices along the way, then hands all three to
    self._process_gradients (under self._lock in sync mode).

    Returns:
        A ReportGradientResponse with `accepted` and `model_version` set.

    Raises:
        ValueError: If a gradient name is unknown, or its shape/indices
            are incompatible with the corresponding model variable.
    """
    model_version_valid = self._use_async or self._validate_model_version(
        request.model_version)

    res = elasticdl_pb2.ReportGradientResponse()
    if not model_version_valid:
        logger.warning(
            "Task result for outdated version %d dropped",
            request.model_version,
        )
        res.accepted = False
        res.model_version = self._version
        return res

    non_embedding_gradients = {}
    indexed_grads = {}
    edl_embedding_gradients = {}
    # Do sanity check before accumulating gradients.
    for v in request.gradient:
        tensor = Tensor.from_tensor_pb(v)
        name = tensor.name
        if name not in self._model:
            if tensor.is_indexed_slices():
                # grads of ElasticDL Embedding layer
                # TODO: check arr.shape[1] = embedding_dim of this
                # EdlEmbedding layer
                edl_embedding_gradients[name] = tensor.to_tf_tensor()
                continue
            else:
                # BUGFIX: format the message with %, instead of passing
                # `name` as a second exception argument (logging style),
                # which left the message unformatted.
                raise ValueError(
                    "Gradient key: %s is not part of model" % name)

        if tensor.is_indexed_slices():
            if (tensor.values.shape[1] !=
                    self._model[name].numpy().shape[1]):
                raise ValueError(
                    "Gradient key: %s has incompatible "
                    "indexed slice dimension %d, expected %d" % (
                        name,
                        tensor.values.shape[1],
                        self._model[name].numpy().shape[1],
                    ))

            max_index = tf.math.reduce_max(tensor.indices).numpy()
            if max_index >= self._model[name].numpy().shape[0]:
                raise ValueError(
                    "Gradient key: %s has wrong indices %d, "
                    "out of range %d" % (
                        name,
                        max_index,
                        self._model[name].numpy().shape[0] - 1,
                    ))
            indexed_grads[name] = tensor.to_tf_tensor()
        else:
            if tensor.values.shape != self._model[name].numpy().shape:
                # BUGFIX: same logging-style ValueError fixed to use %.
                raise ValueError(
                    "Gradient key: %s has incompatible dimension" % name)
            non_embedding_gradients[name] = tensor.to_tf_tensor()

    if not self._use_async:
        self._lock.acquire()
    # BUGFIX: release the lock even if _process_gradients raises;
    # previously an exception here left self._lock held forever.
    try:
        self._process_gradients(
            edl_embedding_gradients,
            indexed_grads,
            non_embedding_gradients,
            request.model_version,
        )
    finally:
        if not self._use_async:
            self._lock.release()

    res.accepted = True
    res.model_version = self._version
    return res