def report_evaluation_metrics(self, model_outputs_pb, labels):
    """Deserializes model outputs and labels, then updates evaluation metrics."""
    labels = pb_to_ndarray(labels)
    model_outputs = {}
    for name, tensor_pb in model_outputs_pb.items():
        model_outputs[name] = pb_to_ndarray(tensor_pb)
    self.evaluation_metrics.update_evaluation_metrics(
        model_outputs, labels
    )
def pull_embedding_vectors(self, layer_name, embedding_ids):
    """Pulls and returns embedding vectors ordered by the embedding ids."""
    ps_ids = {}
    ps_ids_index = {}
    for idx, embedding_id in enumerate(embedding_ids):
        ps_id = int_to_id(embedding_id, self._ps_num)
        ps_ids.setdefault(ps_id, []).append(embedding_id)
        ps_ids_index.setdefault(ps_id, []).append(idx)

    embeddings = []
    index = []
    pb_future_and_id_pairs = []
    for ps_id, ids in ps_ids.items():
        req = elasticdl_pb2.PullEmbeddingVectorRequest()
        req.name = layer_name
        req.ids.extend(ids)
        pb_future = self._ps_stubs[ps_id].pull_embedding_vectors.future(
            req
        )
        pb_future_and_id_pairs.append((pb_future, ps_id))
    for pb_future, ps_id in pb_future_and_id_pairs:
        pb = pb_future.result()
        embeddings.append(pb_to_ndarray(pb))
        index.extend(ps_ids_index[ps_id])
    embeddings = np.concatenate(embeddings)

    # Scatter the shard-ordered vectors back to the order of the
    # requested embedding ids.
    new_embeddings = np.empty_like(embeddings)
    new_embeddings[index] = embeddings
    return new_embeddings
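# A minimal standalone sketch (not part of the worker class) of the scatter
# trick used above to restore the caller's ordering. The ids and vectors here
# are hypothetical: ids [5, 2, 9] are assumed to come back grouped by PS
# shard as [2, 5, 9], with `index` recording each id's original position.
import numpy as np

embeddings = np.array([[2.0], [5.0], [9.0]])  # vectors for ids 2, 5, 9
index = [1, 0, 2]  # original positions of ids 2, 5, 9 in the request [5, 2, 9]

reordered = np.empty_like(embeddings)
reordered[index] = embeddings  # scatter back to the requested id order
print(reordered.ravel())  # [5. 2. 9.]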
def get_model(self):
    self._timing.start_record_time("get_model")
    if self._distribution_strategy != DistributionStrategy.ALLREDUCE:
        variable_future_and_id_pairs = []
        if self._use_multi_ps:
            self.init_ps_var_partition()
        for ps_id, stub in enumerate(self._ps_stubs):
            if ps_id not in self._ps_vars:
                continue
            # Asynchronous gRPC call
            req = elasticdl_pb2.PullDenseParametersRequest()
            req.version = self._model_versions_from_ps[ps_id]
            var_future = stub.pull_dense_parameters.future(req)
            variable_future_and_id_pairs.append((var_future, ps_id))

        for var_future, ps_id in variable_future_and_id_pairs:
            res = var_future.result()
            if not res.initialized:
                # Push variables to the PS for initialization, then pull again
                self.report_variable_to_ps(ps_id)
                req = elasticdl_pb2.PullDenseParametersRequest()
                req.version = self._model_versions_from_ps[ps_id]
                res = self._ps_stubs[ps_id].pull_dense_parameters(req)
                if not res.initialized:
                    # TODO: support PS fault-tolerance
                    raise RuntimeError(
                        "PS pod %d cannot be initialized" % ps_id
                    )

            for name, pb in res.dense_parameters.items():
                self._non_embed_vars[name].assign(pb_to_ndarray(pb))
            self._model_versions_from_ps[ps_id] = res.version

        self._model_version = max(self._model_versions_from_ps)
    self._timing.end_record_time("get_model")
def test_pull_dense_parameters(self):
    self.create_default_server_and_stub()
    param0 = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    pull_req = elasticdl_pb2.PullDenseParametersRequest()
    pull_req.version = -1

    # Try to pull variables before the PS is initialized
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertFalse(res.initialized)

    # Initialize variables by pushing the model
    req = elasticdl_pb2.Model()
    req.version = 1
    for name, var in param0.items():
        serialize_ndarray(var, req.dense_parameters[name])
    res = self._stub.push_model(req)
    self.assertEqual(res, empty_pb2.Empty())

    # Pull the variables back
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertTrue(res.initialized)
    self.assertEqual(res.version, req.version)
    for name, pb in res.dense_parameters.items():
        tensor = pb_to_ndarray(pb)
        self.assertTrue(np.allclose(param0[name], tensor))

    # Pull again with the latest version; no parameters are returned
    # because the PS has nothing newer
    pull_req.version = res.version
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertTrue(res.initialized)
    self.assertEqual(res.version, pull_req.version)
    self.assertTrue(not res.dense_parameters)
def init_from_model_pb(self, model_pb):
    """Initializes `Parameters` from a model protocol buffer.

    `Parameters` accepts the model pb only if it has not been
    initialized yet; otherwise the model pb is ignored.

    Args:
        model_pb: The model protocol buffer used for initialization.

    Returns:
        A bool indicating whether `Parameters` accepted this model pb.
    """
    if not self.initialized:
        infos = model_pb.embedding_table_infos
        self.init_embedding_params(infos)
        for name, pb in model_pb.dense_parameters.items():
            # Note that `tf.Variable` mangles names: a variable created
            # with name "somename" ends up named "somename:0". Thus
            # `tf.Variable.name` is unreliable and we must avoid using
            # it on the PS side.
            arr = pb_to_ndarray(pb)
            var = tf.Variable(initial_value=arr, trainable=True)
            self.non_embedding_params[name] = var
        for name, pb in model_pb.embedding_tables.items():
            s = pb_to_indexed_slices(pb)
            self.embedding_params[name].set(s.indices, s.values)
        self.version = max(0, model_pb.version)
        self.initialized = True
        return True
    return False
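# A small illustration of the name mangling noted in the comment above:
# TensorFlow appends an output index to variable names, so the name passed
# in cannot be recovered verbatim from `tf.Variable.name`.
import tensorflow as tf

v = tf.Variable(initial_value=[1.0], trainable=True, name="somename")
print(v.name)  # "somename:0", not "somename"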
def pull_dense_parameters(self, ps_ids, model_versions):
    """Pulls dense parameters from the given PS pods."""
    variable_future_and_id_pairs = []
    for ps_id in ps_ids:
        if ps_id not in self.ps_to_parameter:
            continue
        stub = self.ps_stubs[ps_id]
        # Asynchronous gRPC call
        req = elasticdl_pb2.PullDenseParametersRequest()
        req.version = model_versions[ps_id]
        var_future = stub.pull_dense_parameters.future(req)
        variable_future_and_id_pairs.append((var_future, ps_id))

    dense_params = {}
    uninit_ps = []
    for var_future, ps_id in variable_future_and_id_pairs:
        res = var_future.result()
        if not res.initialized:
            uninit_ps.append(ps_id)
        else:
            for name, pb in res.dense_parameters.items():
                dense_params[name] = pb_to_ndarray(pb)
            model_versions[ps_id] = res.version
    return dense_params, uninit_ps
def _get_params_shard_from_pb(model_pb, shard_index, shard_num):
    """Gets one shard of parameters, including variable values and
    embedding tables, from a model protobuf.

    Args:
        model_pb: A Model protobuf instance.
        shard_index: Model shard index.
        shard_num: The total number of model shards.

    Returns:
        non_embedding_vars: A Python dict in which the key is a variable
            name and the value is a `tf.Variable` object.
        embedding_table_values: A Python dict in which the key is an
            embedding table name and the value is a tuple with 2
            elements. The value[0] is indices and value[1] is the
            corresponding embedding vectors.
    """
    non_embedding_vars = {}
    embedding_table_values = {}

    for name, pb in model_pb.dense_parameters.items():
        if string_to_id(name, shard_num) == shard_index:
            non_embedding_vars[name] = tf.Variable(
                initial_value=pb_to_ndarray(pb), trainable=True
            )
    for name, pb in model_pb.embedding_tables.items():
        embedding_table_values.setdefault(name, ([], []))
        t = pb_to_indexed_slices(pb)
        for embedding_id, vector in zip(t.indices, t.values):
            if int_to_id(embedding_id, shard_num) == shard_index:
                embedding_table_values[name][0].append(embedding_id)
                embedding_table_values[name][1].append(vector)
    return non_embedding_vars, embedding_table_values
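# A hedged sketch of the sharding helpers assumed above. `string_to_id` and
# `int_to_id` here are illustrative stand-ins, not the confirmed ElasticDL
# implementations: any deterministic mapping from a name/id to a shard in
# [0, shard_num) works, as long as all workers and PS pods agree on it.
import hashlib

def string_to_id(name, shard_num):
    # Hash the name with md5 so the mapping is stable across processes,
    # unlike Python's built-in hash().
    digest = hashlib.md5(name.encode("utf-8")).hexdigest()
    return int(digest, 16) % shard_num

def int_to_id(embedding_id, shard_num):
    return int(embedding_id) % shard_num

print(string_to_id("dense/kernel", 4))  # a deterministic shard in [0, 4)
print(int_to_id(12345, 4))              # 1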
def get_embedding_vectors(self, name, ids):
    pull_req = elasticdl_pb2.PullEmbeddingVectorRequest()
    pull_req.name = name
    pull_req.ids.extend(ids)
    res = self._stub.pull_embedding_vectors(pull_req)
    if res.tensor_content:
        return pb_to_ndarray(res)
    else:
        return None
def verify(array):
    """Round-trips an ndarray through its protobuf form and checks equality."""
    pb = ndarray_to_pb(array)
    new_array = pb_to_ndarray(pb)
    np.testing.assert_array_equal(array, new_array)
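# Hypothetical inputs exercising the round trip on a couple of shapes and
# dtypes; assumes `verify` and its numpy/protobuf helpers are in scope.
verify(np.arange(12, dtype=np.float32).reshape(3, 4))
verify(np.random.rand(10).astype(np.float64))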
def push_gradients(self, request, _):
    res = elasticdl_pb2.PushGradientsResponse()
    if self._use_async:
        grad_vars = []
        for name, pb in request.gradients.dense_parameters.items():
            grad = pb_to_ndarray(pb)
            self._parameters.check_grad(Tensor(name, grad, None))
            grad = tf.constant(grad)
            var = self._parameters.get_non_embedding_param(name)
            grad_vars.append((grad, var))

        for name, pb in request.gradients.embedding_tables.items():
            grad = pb_to_indexed_slices(pb)
            self._parameters.check_grad(
                Tensor(name, grad.values, grad.indices)
            )
            if name in self._parameters.non_embedding_params:
                var = self._parameters.get_non_embedding_param(name)
                grad_vars.append((grad, var))
            else:
                grad_vars.append((grad, name))

        learning_rate = request.learning_rate
        # TODO: if request.learning_rate == 0.0, modulate learning_rate
        # in self._optimizer with staleness
        if self._lr_staleness_modulation and learning_rate > 0.0:
            staleness = max(
                1, self._parameters.version - request.gradients.version
            )
            # Modulate the learning rate by the gradient staleness
            learning_rate /= staleness

        self._set_optimizer_learning_rate(learning_rate)
        self._optimizer.apply_gradients(grad_vars)
        with self._version_lock:
            self._parameters.version += 1
            self._save_params_to_checkpoint_if_needed()
            version = self._parameters.version
        self._report_version_if_needed(version)

        res.accepted = True
        res.version = self._parameters.version
        return res
    else:
        if (
            request.gradients.version
            < self._parameters.version - self._sync_version_tolerance
        ):
            res.accepted = False
            res.version = self._parameters.version
            return res

        with self._lock:
            for name, pb in request.gradients.dense_parameters.items():
                grad = pb_to_ndarray(pb)
                self._parameters.check_grad(Tensor(name, grad, None))
                if name in self._grads_buffer:
                    self._grads_buffer[name] = (
                        self._grads_buffer[name] + grad
                    )
                else:
                    self._grads_buffer[name] = grad

            for name, pb in request.gradients.embedding_tables.items():
                grad = pb_to_indexed_slices(pb)
                self._parameters.check_grad(
                    Tensor(name, grad.values, grad.indices)
                )
                if name in self._grads_buffer:
                    self._grads_buffer[name] = merge_indexed_slices(
                        self._grads_buffer[name], grad
                    )
                else:
                    self._grads_buffer[name] = grad

            self._grads_n += 1
            res.accepted = True

            updated_version = False
            version = self._parameters.version
            if self._grads_n == self._grads_to_wait:
                grad_vars = []
                for name, grad in self._grads_buffer.items():
                    # Dense gradients are averaged,
                    # while sparse gradients are summed
                    if not isinstance(grad, tf.IndexedSlices):
                        grad = grad / self._grads_to_wait
                        grad = tf.constant(grad)
                    var = self._parameters.get_non_embedding_param(name)
                    if var is None:
                        grad_vars.append((grad, name))
                    else:
                        grad_vars.append((grad, var))

                self._set_optimizer_learning_rate(request.learning_rate)
                self._optimizer.apply_gradients(grad_vars)
                self._grads_n = 0
                self._grads_buffer.clear()
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
                updated_version = True

        if updated_version:
            self._report_version_if_needed(version)
        res.version = version
        return res
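# A small standalone illustration (not the PS server code) of the staleness
# modulation applied in the async branch above: the effective learning rate
# shrinks linearly with how far a gradient's base version lags the current
# parameter version.
def modulated_lr(learning_rate, parameters_version, gradients_version):
    staleness = max(1, parameters_version - gradients_version)
    return learning_rate / staleness

print(modulated_lr(0.1, 12, 12))  # fresh gradient: 0.1
print(modulated_lr(0.1, 12, 8))   # 4 versions stale: 0.025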