def report_embedding_info(self):
    """Push embedding-table metadata to every parameter server.

    Collects an `EmbeddingTableInfo` entry for each embedding layer and
    each embedding column into one `elasticdl_pb2.Model` message, then
    sends it to all PS pods via `push_embedding_table_infos`.
    """
    model = elasticdl_pb2.Model()

    def _add_embedding_info(name, dim, initializer):
        # Helper to avoid duplicating the add/assign sequence for
        # layers and columns.  All tables are stored as float32 on PS.
        info = model.embedding_table_infos.add()
        info.name = name
        info.dim = dim
        info.initializer = initializer
        # set to float32
        info.dtype = dtype_numpy_to_tensor(np.dtype("float32"))

    if self._embedding_layers:
        for layer in self._embedding_layers:
            _add_embedding_info(
                layer.embedding_weight_name,
                layer.output_dim,
                layer.embeddings_initializer,
            )
    if self._embedding_columns:
        for column in self._embedding_columns:
            # TODO(brightcoder01): The initializer in embedding column is
            # a variable initializer function. For embedding layer, it's a
            # tf.keras.initializers. Keep aligned between these two.
            _add_embedding_info(
                column.embedding_weight_name, column.dimension, "uniform"
            )
    for ps_id in range(self._ps_num):
        self._ps_stubs[ps_id].push_embedding_table_infos(model)
def GetModel(self, request, _):
    """gRPC handler: return either the live model or a checkpointed one.

    The live model is returned when the request asks for the minimum
    method or for the current version; otherwise the fixed version is
    read back from the checkpoint service.
    """
    if not self._use_async:
        self._validate_model_version(request.version)

    wants_current = (
        request.method == elasticdl_pb2.MINIMUM
        or request.version == self._version
    )
    if wants_current:
        # In async mode no lock is needed; otherwise serialize access.
        if self._use_async:
            return self._get_model_no_lock()
        with self._lock:
            return self._get_model_no_lock()

    # Read from checkpoint for the fixed version model.  On failure an
    # empty Model message is returned (best effort, error is logged).
    pb_model = elasticdl_pb2.Model()
    try:
        pb_model = self._checkpoint_service.get_checkpoint_model(
            request.version
        )
    except Exception:
        logger.error(
            "Failed to fetch checkpoint model for "
            "model version {}".format(request.version)
        )
    return pb_model
def load_from_checkpoint_file(file_name):
    """Deserialize an `elasticdl_pb2.Model` from a checkpoint file."""
    from elasticdl.proto import elasticdl_pb2

    with open(file_name, "rb") as checkpoint:
        serialized = checkpoint.read()
    model = elasticdl_pb2.Model()
    model.ParseFromString(serialized)
    return model
def test_pull_variable(self):
    """push_model initializes the server; pull_variable round-trips it."""
    self.create_default_server_and_stub()
    expected = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    pull_req = empty_pb2.Empty()
    # Pulling before any push: server must report "not initialized".
    res = self._stub.pull_variable(pull_req)
    self.assertFalse(res.model_init_status)
    # Initialize the model on the server with version 1.
    push_req = elasticdl_pb2.Model()
    push_req.version = 1
    for name, value in expected.items():
        emplace_tensor_pb_from_ndarray(push_req.param, value, name=name)
    self.assertEqual(self._stub.push_model(push_req), empty_pb2.Empty())
    # Pull again and verify every parameter comes back unchanged.
    res = self._stub.pull_variable(pull_req)
    self.assertTrue(res.model_init_status)
    self.assertEqual(res.model.version, push_req.version)
    for param in res.model.param:
        restored = tensor_pb_to_ndarray(param)
        self.assertTrue(np.allclose(expected[param.name], restored))
def test_pull_dense_parameters(self):
    """Dense parameters round-trip through push_model / pull_dense_parameters."""
    self.create_default_server_and_stub()
    expected = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    pull_req = elasticdl_pb2.PullDenseParametersRequest()
    pull_req.version = -1
    # Before any push the server reports "not initialized".
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertFalse(res.initialized)
    # Initialize with version 1.
    push_req = elasticdl_pb2.Model()
    push_req.version = 1
    for name, value in expected.items():
        serialize_ndarray(value, push_req.dense_parameters[name])
    self.assertEqual(self._stub.push_model(push_req), empty_pb2.Empty())
    # Pull and compare values.
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertTrue(res.initialized)
    self.assertEqual(res.version, push_req.version)
    for name, pb in res.dense_parameters.items():
        self.assertTrue(np.allclose(expected[name], pb_to_ndarray(pb)))
    # Pulling at the already-known version returns no parameters.
    pull_req.version = res.version
    res = self._stub.pull_dense_parameters(pull_req)
    self.assertTrue(res.initialized)
    self.assertEqual(res.version, pull_req.version)
    self.assertTrue(not res.dense_parameters)
def report_variable_to_ps(self, ps_id):
    """Serialize the variables owned by PS `ps_id` and push them to it."""
    model = elasticdl_pb2.Model()
    model.version = self._model_versions_from_ps[ps_id]
    if ps_id in self._ps_vars:
        for variable in self._ps_vars[ps_id]:
            serialize_ndarray(
                variable.numpy(), model.dense_parameters[variable.name]
            )
    self._ps_stubs[ps_id].push_model(model)
def report_variable_to_ps(self, ps_id):
    """Push this worker's variables assigned to PS `ps_id` as tensor pbs."""
    model = elasticdl_pb2.Model()
    if ps_id in self._ps_vars:
        for variable in self._ps_vars[ps_id]:
            emplace_tensor_pb_from_ndarray(
                model.param, variable.numpy(), name=variable.name
            )
    self._ps_stubs[ps_id].push_model(model)
def get_version_from_checkpoint(checkpoint_dir):
    """Get model version from the checkpoint.

    There may be several shard files in the checkpoint directory. The
    model versions of shard files are same, so we only need to read one
    shard file to get model version.

    Args:
        checkpoint_dir: directory containing the checkpoint shard files.

    Returns:
        The integer model version stored in the checkpoint.

    Raises:
        ValueError: if the checkpoint directory contains no shard files.
    """
    variable_shard_files = os.listdir(checkpoint_dir)
    if not variable_shard_files:
        # Fail with a clear message instead of an opaque IndexError on
        # variable_shard_files[0].
        raise ValueError(
            "No checkpoint shard files found in %s" % checkpoint_dir
        )
    shard_file_path = os.path.join(checkpoint_dir, variable_shard_files[0])
    model_pb = elasticdl_pb2.Model()
    model_pb = load_pb_from_file(model_pb, shard_file_path)
    return model_pb.version
def report_embedding_info(self):
    """Send `EmbeddingTableInfo` for every embedding layer to all PS pods."""
    model = elasticdl_pb2.Model()
    if self._embedding_layers:
        infos = model.embedding_table_info
        for layer in self._embedding_layers:
            entry = infos.add()
            entry.name = layer.name
            entry.dim = layer.output_dim
            entry.initializer = layer.embeddings_initializer
    for ps_id in range(len(self._ps_stubs)):
        self._ps_stubs[ps_id].push_embedding_info(model)
def restore_params_from_checkpoint(checkpoint_dir, shard_index, shard_num):
    """Restore one shard of model parameters from a checkpoint directory.

    If ``shard_num == 1`` the entire model parameters will be restored.

    Args:
        checkpoint_dir: a directory with checkpoint files.
        shard_index: model shard index, e.g. the PS instance index using
            ParameterServerStrategy with multiple PS instances.
        shard_num: the total number of model shards, e.g. the total PS
            instance count using ParameterServerStrategy with multiple
            PS instances.

    Returns:
        parameters: A Parameters object which contains model version,
            non-embedding parameters and embedding tables for the PS
            instance with ps_id.

    Raises:
        ValueError: if the shard files carry inconsistent model versions.
    """
    variable_shard_files = os.listdir(checkpoint_dir)
    non_embedding_vars = {}
    embedding_tables = {}
    version = None
    for shard_file in variable_shard_files:
        shard_file_path = os.path.join(checkpoint_dir, shard_file)
        model_pb = elasticdl_pb2.Model()
        model_pb = load_pb_from_file(model_pb, shard_file_path)
        # All shards must agree on the model version; the first shard
        # read establishes it.
        if version is None:
            version = model_pb.version
        elif version != model_pb.version:
            raise ValueError(
                "The versions in model shards are not consistent"
            )
        for embedding_info_pb in model_pb.embedding_table_infos:
            embedding_table = create_embedding_table(embedding_info_pb)
            # setdefault: the same table may be described by several
            # shards; keep the first instance created.
            embedding_tables.setdefault(
                embedding_table.name, embedding_table
            )
        # Extract only the slice of parameters belonging to this shard.
        (
            shard_non_embedding_vars,
            shard_embedding_table_values,
        ) = _get_params_shard_from_pb(model_pb, shard_index, shard_num)
        non_embedding_vars.update(shard_non_embedding_vars)
        # pair is (ids, values) — fill the table with this shard's rows.
        for name, pair in shard_embedding_table_values.items():
            embedding_tables[name].set(pair[0], pair[1])
    parameters = Parameters()
    parameters.non_embedding_params.update(non_embedding_vars)
    parameters.embedding_params.update(embedding_tables)
    parameters.version = version
    return parameters
def push_embedding_table_infos(self, infos):
    """Copy the given embedding-table infos into a Model pb and push it
    to every parameter server.
    """
    model = elasticdl_pb2.Model()
    table_infos = model.embedding_table_infos
    for source in infos:
        target = table_infos.add()
        target.name = source.name
        target.dim = source.dim
        target.initializer = source.initializer
        target.dtype = source.dtype
    for ps_id in range(self.ps_num):
        self.ps_stubs[ps_id].push_embedding_table_infos(model)
def push_dense_parameters(self, parameters, ps_id, version):
    """Push dense parameters to PS.

    Args:
        parameters: a list of Tensors.
        ps_id: PS id.
        version: model version.
    """
    model = elasticdl_pb2.Model()
    model.version = version
    # Only serialize parameters assigned to this PS instance.
    owned = (
        p for p in parameters if self.parameter_to_ps[p.name] == ps_id
    )
    for param in owned:
        serialize_ndarray(param.values, model.dense_parameters[param.name])
    self.ps_stubs[ps_id].push_model(model)
def test_emplace_tensor_pb_from_ndarray(self):
    """Emplaced tensor pb must equal Tensor(...).to_tensor_pb() field by field."""
    dense = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], np.float32)
    row_ids = np.array([0, 2])
    label = "test"
    model = elasticdl_pb2.Model()
    emplace_tensor_pb_from_ndarray(model.param, dense, row_ids, label)
    pb = model.param[-1]
    print("pb", pb)
    expected_pb = Tensor(dense, row_ids, label).to_tensor_pb()
    for field in ("name", "dim", "content", "indices", "dtype"):
        self.assertEqual(getattr(pb, field), getattr(expected_pb, field))
def push_gradient_test_setup(self):
    """Prepare fixtures for push_gradient tests: dense variables, two
    rounds of gradients, an embedding table, and embedding gradients;
    then push the initial model to the PS stub.
    """
    self.var_names = ["test_1", "test_2"]
    self.var_values = [
        np.array([10.0, 20.0, 30.0], np.float32),
        np.array([20.0, 40.0, 60.0], np.float32),
    ]
    # Gradients for the first push round, one per variable.
    self.grad_values0 = [
        np.array([1.0, 2.0, 3.0], np.float32),
        np.array([2.0, 4.0, 6.0], np.float32),
    ]
    # Gradients for the second push round.
    self.grad_values1 = [
        np.array([0.0, 0.0, 7.0], np.float32),
        np.array([9.0, 9.0, 6.0], np.float32),
    ]
    dim = self._embedding_info.dim
    # 4-row random embedding table of shape (4, dim).
    self.embedding_table = (
        np.random.rand(4 * dim).reshape((4, dim)).astype(np.float32)
    )
    # Sparse embedding gradients; note indices contain duplicates
    # (e.g. 3 appears twice) to exercise gradient accumulation.
    self.embedding_grads0 = tf.IndexedSlices(
        values=np.random.rand(3 * dim)
        .reshape((3, dim))
        .astype(np.float32),
        indices=(3, 1, 3),
    )
    self.embedding_grads1 = tf.IndexedSlices(
        values=np.random.rand(3 * dim)
        .reshape((3, dim))
        .astype(np.float32),
        indices=(2, 2, 3),
    )
    # Push the initial model (dense vars + embedding info) to the PS.
    push_model_req = elasticdl_pb2.Model()
    push_model_req.version = self._parameters.version
    for name, value in zip(self.var_names, self.var_values):
        emplace_tensor_pb_from_ndarray(
            push_model_req.param, value, name=name
        )
    push_model_req.embedding_table_info.append(self._embedding_info)
    self._stub.push_model(push_model_req)
    # Mirror the pushed state into the local Parameters object.
    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    self._parameters.embedding_params[self._embedding_info.name].set(
        range(len(self.embedding_table)), self.embedding_table
    )
def test_pull_embedding_vectors(self):
    """Register two embedding tables and verify pulled vectors have the
    expected shapes and that equal ids always yield equal vectors.
    """
    self.create_default_server_and_stub()
    id_list_0 = [1, 3, 9, 6]
    id_list_1 = [8, 9, 1, 0, 6]
    # Push a model carrying two table infos: the fixture's "layer_a"
    # (dim assumed 32 per the asserts below — set in self._embedding_info)
    # and a second table "layer_b" with dim 16.
    req = elasticdl_pb2.Model()
    req.version = 1
    req.embedding_table_infos.append(self._embedding_info)
    another_embedding_info = elasticdl_pb2.EmbeddingTableInfo()
    another_embedding_info.name = "layer_b"
    another_embedding_info.dim = 16
    another_embedding_info.initializer = "normal"
    req.embedding_table_infos.append(another_embedding_info)
    res = self._stub.push_model(req)
    self.assertEqual(res, empty_pb2.Empty())
    # Each pull returns one row per requested id, with the table's dim.
    vectors_a_0 = self.get_embedding_vectors("layer_a", id_list_0)
    self.assertEqual(vectors_a_0.shape[0], len(id_list_0))
    self.assertEqual(vectors_a_0.shape[1], 32)
    vectors_a_1 = self.get_embedding_vectors("layer_a", id_list_1)
    self.assertEqual(vectors_a_1.shape[0], len(id_list_1))
    self.assertEqual(vectors_a_1.shape[1], 32)
    vectors_b_1 = self.get_embedding_vectors("layer_b", id_list_1)
    self.assertEqual(vectors_b_1.shape[0], len(id_list_1))
    self.assertEqual(vectors_b_1.shape[1], 16)
    vectors_b_0 = self.get_embedding_vectors("layer_b", id_list_0)
    self.assertEqual(vectors_b_0.shape[0], len(id_list_0))
    self.assertEqual(vectors_b_0.shape[1], 16)
    # Ids shared between the two requests (1, 9, 6) must map to the
    # same vector across pulls — lazily-initialized rows are persisted.
    for idx0, id0 in enumerate(id_list_0):
        for idx1, id1 in enumerate(id_list_1):
            if id0 == id1:
                self.assertTrue(
                    np.array_equal(vectors_a_0[idx0], vectors_a_1[idx1])
                )
                self.assertTrue(
                    np.array_equal(vectors_b_0[idx0], vectors_b_1[idx1])
                )
    # Pulling an empty id list returns None.
    vectors = self.get_embedding_vectors("layer_a", [])
    self.assertEqual(vectors, None)
def push_gradient_test_setup(self):
    """Prepare fixtures for push_gradient tests (Tensor-based variant):
    dense variables, two gradient rounds, an embedding table and sparse
    embedding gradients; then push the initial model to the PS stub.
    """
    self.var_names = ["test_1", "test_2"]
    self.var_values = [
        np.array([10.0, 20.0, 30.0], np.float32),
        np.array([20.0, 40.0, 60.0], np.float32),
    ]
    # Gradients for the first push round, one per variable.
    self.grad_values0 = [
        np.array([1.0, 2.0, 3.0], np.float32),
        np.array([2.0, 4.0, 6.0], np.float32),
    ]
    # Gradients for the second push round.
    self.grad_values1 = [
        np.array([0.0, 0.0, 7.0], np.float32),
        np.array([9.0, 9.0, 6.0], np.float32),
    ]
    dim = self._embedding_info.dim
    # 4-row random embedding table of shape (4, dim).
    self.embedding_table = (
        np.random.rand(4 * dim).reshape((4, dim)).astype(np.float32)
    )
    # Sparse embedding gradients as project Tensor(name, values, indices);
    # indices contain duplicates to exercise gradient accumulation.
    self.embedding_grads0 = Tensor(
        None,
        np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
        np.asarray([3, 1, 3]),
    )
    self.embedding_grads1 = Tensor(
        None,
        np.random.rand(3 * dim).reshape((3, dim)).astype(np.float32),
        np.asarray([2, 2, 3]),
    )
    # Push the initial model (dense vars + embedding info) to the PS.
    push_model_req = elasticdl_pb2.Model()
    push_model_req.version = self._parameters.version
    for name, value in zip(self.var_names, self.var_values):
        serialize_ndarray(value, push_model_req.dense_parameters[name])
    push_model_req.embedding_table_infos.append(self._embedding_info)
    self._stub.push_model(push_model_req)
    # Mirror the pushed state into the local Parameters object.
    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    self._parameters.embedding_params[self._embedding_info.name].set(
        range(len(self.embedding_table)), self.embedding_table
    )
def to_model_pb(self):
    """
    Convert all parameters including embedding and non-embedding
    parameters to `elasticdl_pb2.Model` which can be serialized.
    """
    model_pb = elasticdl_pb2.Model()
    model_pb.version = self.version
    for name, var in self.non_embedding_params.items():
        serialize_ndarray(var.numpy(), model_pb.dense_parameters[name])
    for name, table in self.embedding_params.items():
        # Slot embedding tables hold optimizer state, not model weights,
        # so they are intentionally excluded from the checkpoint.
        if table.is_slot:
            continue
        serialize_indexed_slices(
            table.to_indexed_slices(), model_pb.embedding_tables[name]
        )
        model_pb.embedding_table_infos.append(
            table.to_embedding_table_info_pb()
        )
    return model_pb
def to_model_pb(self):
    """
    Convert all parameters including embedding and non-embedding
    parameters to `elasticdl_pb2.Model` which can be serialized.
    """
    model_pb = elasticdl_pb2.Model()
    model_pb.version = self.version
    for name, var in self.non_embedding_params.items():
        emplace_tensor_pb_from_ndarray(
            model_pb.param, var.numpy(), name=name
        )
    for table in self.embedding_params.values():
        # Each embedding table contributes one tensor pb plus its info.
        serialize_tensor(table.to_tensor(), model_pb.param.add())
        model_pb.embedding_table_info.append(
            table.to_embedding_table_info_pb()
        )
    return model_pb
def report_embedding_info(self):
    """Send embedding metadata (layers and feature columns) to all PS pods."""
    model = elasticdl_pb2.Model()
    infos = model.embedding_table_info
    if self._embedding_layers:
        for layer in self._embedding_layers:
            entry = infos.add()
            entry.name = layer.name
            entry.dim = layer.output_dim
            entry.initializer = layer.embeddings_initializer
    if self._embedding_columns:
        for column in self._embedding_columns:
            entry = infos.add()
            entry.name = column.name
            entry.dim = column.dimension
            # TODO(brightcoder01): The initializer in embedding column is
            # a variable initializer function. For embedding layer, it's a
            # tf.keras.initializers. Keep aligned between these two.
            entry.initializer = "uniform"
    for ps_id in range(len(self._ps_stubs)):
        self._ps_stubs[ps_id].push_embedding_info(model)
def _get_model_no_lock(self):
    """Snapshot the in-memory model into a Model pb; caller handles locking."""
    pb_model = elasticdl_pb2.Model()
    pb_model.version = self._version
    for name, variable in self._model.items():
        pb_model.param[name].CopyFrom(ndarray_to_tensor(variable.numpy()))
    return pb_model
def _get_model_no_lock(self):
    """Snapshot the in-memory model into a Model pb; caller handles locking."""
    pb_model = elasticdl_pb2.Model()
    pb_model.version = self._version
    for name, variable in self._model.items():
        emplace_tensor_pb_from_ndarray(
            pb_model.param, variable.numpy(), name=name
        )
    return pb_model
def test_push_model(self):
    """push_model initializes parameters exactly once (first call wins)
    and creates the optimizer's slot tables with their init values.
    """
    opt_func_name = "ftrl_optimizer"
    opt = load_module(_module_file).__dict__[opt_func_name]()
    opt_config = opt.get_config()
    # FTRL keeps two slots per embedding table.
    slot_names = ["accumulator", "linear"]
    slot_init_value = {
        "accumulator": opt_config["initial_accumulator_value"],
        "linear": 0.0,
    }
    self.create_default_server_and_stub(optimizer=opt_func_name)
    param0 = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    param1 = {
        "v0": np.ones([3, 2], dtype=np.float32),
        "v1": np.ones([10, 32], dtype=np.float32),
    }
    models = [param0, param1]
    for idx, model in enumerate(models):
        req = elasticdl_pb2.Model()
        req.version = idx + 1
        for name in model:
            serialize_ndarray(model[name], req.dense_parameters[name])
        req.embedding_table_infos.append(self._embedding_info)
        res = self._stub.push_model(req)
        self.assertEqual(res, empty_pb2.Empty())
    # self._parameters is initialized with the first push_model call
    # and the second push_model has no effect
    self.assertEqual(self._parameters.version, 1)
    for name in param0:
        self.assertTrue(
            np.allclose(
                param0[name],
                self._parameters.non_embedding_params[name].numpy(),
            )
        )
    self.assertEqual(
        self._embedding_info.name,
        self._parameters.embedding_params[
            self._embedding_info.name
        ].name,
    )
    self.assertEqual(
        self._embedding_info.dim,
        self._parameters.embedding_params[
            self._embedding_info.name
        ].dim,
    )
    self.assertEqual(
        tf.keras.initializers.get(
            self._embedding_info.initializer
        ).__class__,
        self._parameters.embedding_params[
            self._embedding_info.name
        ].initializer.__class__,
    )
    for slot_name in slot_names:
        name = get_slot_table_name(
            self._embedding_info.name, slot_name
        )
        table = self._parameters.embedding_params[name]
        # BUGFIX: these were assertTrue(a, b), which treats the second
        # argument as the failure *message* and always passes for a
        # non-empty first argument.  assertEqual performs the intended
        # comparison.
        self.assertEqual(name, table.name)
        self.assertEqual(self._embedding_info.dim, table.dim)
        embedding = table.get([2])
        self.assertTrue(
            (embedding - slot_init_value[slot_name] < 0.0001).all()
        )