def report_variable_to_master(self):
    """Send every non-embedding variable to the master via ReportVariable."""
    request = elasticdl_pb2.ReportVariableRequest()
    for variable in self._non_embed_vars.values():
        emplace_tensor_pb_from_ndarray(
            request.variable, variable.numpy(), name=variable.name
        )
    self._stub.ReportVariable(request)
def test_pull_variable(self):
    """pull_variable reports uninitialized before push_model and echoes
    the pushed parameters afterwards."""
    self.create_default_server_and_stub()
    initial_params = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    pull_req = empty_pb2.Empty()
    # Pulling before any push_model: server reports "not initialized".
    res = self._stub.pull_variable(pull_req)
    self.assertFalse(res.model_init_status)
    # Initialize server-side parameters with a push_model call.
    push_req = elasticdl_pb2.Model()
    push_req.version = 1
    for name, value in initial_params.items():
        emplace_tensor_pb_from_ndarray(push_req.param, value, name=name)
    self.assertEqual(self._stub.push_model(push_req), empty_pb2.Empty())
    # Now the pull should succeed and return exactly what was pushed.
    res = self._stub.pull_variable(pull_req)
    self.assertTrue(res.model_init_status)
    self.assertEqual(res.model.version, push_req.version)
    for param in res.model.param:
        self.assertTrue(
            np.allclose(
                initial_params[param.name], tensor_pb_to_ndarray(param)
            )
        )
def report_variable_to_ps(self, ps_id):
    """Push the variables assigned to PS `ps_id` to that PS via push_model.

    An empty Model pb is still pushed when this worker holds no variables
    for the given PS.
    """
    model_pb = elasticdl_pb2.Model()
    for variable in self._ps_vars.get(ps_id, []):
        emplace_tensor_pb_from_ndarray(
            model_pb.param, variable.numpy(), name=variable.name
        )
    self._ps_stubs[ps_id].push_model(model_pb)
def makeGrad():
    """Build a ReportGradientRequest whose gradients match the test model
    (variables "x" and "y"), tagged with model_version 1."""
    request = elasticdl_pb2.ReportGradientRequest()
    gradients = {
        "x": np.array([0.1], np.float32),
        "y": np.array([0.03, 0.06], np.float32),
    }
    for name, grad in gradients.items():
        emplace_tensor_pb_from_ndarray(request.gradient, grad, name=name)
    request.model_version = 1
    return request
def report_evaluation_metrics(self, model_outputs, labels):
    """Send concatenated evaluation outputs and labels to the PS.

    Fire-and-forget: the RPC response is ignored.
    """
    request = elasticdl_pb2.ReportEvaluationMetricsRequest()
    for name, outputs in model_outputs.items():
        merged = np.concatenate(outputs)
        emplace_tensor_pb_from_ndarray(
            request.model_outputs, merged, name=name
        )
    label_tensor = Tensor(values=np.concatenate(labels))
    serialize_tensor(label_tensor, request.labels)
    self._stub.report_evaluation_metrics(request)
def test_emplace_tensor_pb_from_ndarray(self):
    """The pb appended by emplace_tensor_pb_from_ndarray must equal the
    one produced by Tensor(...).to_tensor_pb(), field by field."""
    values = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], np.float32)
    indices = np.array([0, 2])
    name = "test"
    model = elasticdl_pb2.Model()
    emplace_tensor_pb_from_ndarray(model.param, values, indices, name)
    actual_pb = model.param[-1]
    print("pb", actual_pb)
    expected_pb = Tensor(values, indices, name).to_tensor_pb()
    for field in ("name", "dim", "content", "indices", "dtype"):
        self.assertEqual(
            getattr(actual_pb, field), getattr(expected_pb, field)
        )
def push_gradient_test_setup(self):
    """Build the shared fixture for the push_gradient tests.

    Creates two dense variables with two rounds of gradients each, a
    4-row embedding table with two rounds of sparse gradients, pushes the
    model to the server stub, then mirrors the same values into
    self._parameters so tests can compare against server-side updates.
    """
    self.var_names = ["test_1", "test_2"]
    self.var_values = [
        np.array([10.0, 20.0, 30.0], np.float32),
        np.array([20.0, 40.0, 60.0], np.float32),
    ]
    self.grad_values0 = [
        np.array([1.0, 2.0, 3.0], np.float32),
        np.array([2.0, 4.0, 6.0], np.float32),
    ]
    self.grad_values1 = [
        np.array([0.0, 0.0, 7.0], np.float32),
        np.array([9.0, 9.0, 6.0], np.float32),
    ]
    dim = self._embedding_info.dim
    # Random 4 x dim float32 embedding table.
    self.embedding_table = (
        np.random.rand(4 * dim).reshape((4, dim)).astype(np.float32)
    )
    # Sparse gradients; indices (3, 1, 3) contain a duplicate row to
    # exercise accumulation of gradients on the same embedding row.
    self.embedding_grads0 = tf.IndexedSlices(
        values=np.random.rand(3 * dim)
        .reshape((3, dim))
        .astype(np.float32),
        indices=(3, 1, 3),
    )
    self.embedding_grads1 = tf.IndexedSlices(
        values=np.random.rand(3 * dim)
        .reshape((3, dim))
        .astype(np.float32),
        indices=(2, 2, 3),
    )
    # Push the dense variables and the embedding table info to the server.
    push_model_req = elasticdl_pb2.Model()
    push_model_req.version = self._parameters.version
    for name, value in zip(self.var_names, self.var_values):
        emplace_tensor_pb_from_ndarray(
            push_model_req.param, value, name=name
        )
    push_model_req.embedding_table_info.append(self._embedding_info)
    self._stub.push_model(push_model_req)
    # Mirror the same values into the local parameter store.
    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    self._parameters.embedding_params[self._embedding_info.name].set(
        range(len(self.embedding_table)), self.embedding_table
    )
def report_evaluation_metrics(self, model_outputs, labels):
    """Report evaluation metrics to the PS.

    Returns:
        Tuple ``(accepted, model_version)`` taken from the RPC response.
    """
    request = elasticdl_pb2.ReportEvaluationMetricsRequest()
    for name, outputs in model_outputs.items():
        merged = np.concatenate(outputs)
        emplace_tensor_pb_from_ndarray(
            request.model_outputs, merged, name=name
        )
    label_tensor = Tensor(values=np.concatenate(labels))
    serialize_tensor(label_tensor, request.labels)
    # Version checking only applies in the multi-PS setup; -1 disables it.
    if self._use_multi_ps:
        request.model_version = self._model_version
    else:
        request.model_version = -1
    response = self._stub.ReportEvaluationMetrics(request)
    return response.accepted, response.model_version
def to_model_pb(self):
    """Serialize all parameters, embedding and non-embedding alike, into
    an `elasticdl_pb2.Model` protobuf.

    Returns:
        The populated `elasticdl_pb2.Model` message.
    """
    model_pb = elasticdl_pb2.Model()
    model_pb.version = self.version
    for name, variable in self.non_embedding_params.items():
        emplace_tensor_pb_from_ndarray(
            model_pb.param, variable.numpy(), name=name
        )
    for embedding_table in self.embedding_params.values():
        # Each embedding table contributes both a tensor pb and its info pb.
        serialize_tensor(embedding_table.to_tensor(), model_pb.param.add())
        model_pb.embedding_table_info.append(
            embedding_table.to_embedding_table_info_pb()
        )
    return model_pb
def test_push_gradient_async_update(self):
    """Async update: every accepted push_gradient call is applied
    immediately and bumps the model version; gradients reported under the
    same name in one request are accumulated on that variable."""
    self.create_default_server_and_stub()
    self.push_gradient_test_setup()
    # Test applying gradients to embedding and non-embedding parameters
    req = elasticdl_pb2.PushGradientRequest()
    for g, name in zip(self.grad_values0, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads0.values,
        indices=self.embedding_grads0.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 1)
    # Dense parameters: one SGD step, v -= lr * g.
    expected_values = [
        v - self._lr * g
        for v, g in zip(self.var_values, self.grad_values0)
    ]
    for name, expected_value in zip(self.var_names, expected_values):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            ))
    # Embedding rows: apply each sparse slice to its row; duplicated
    # indices in the fixture accumulate on the same row.
    expected_embed_table = np.copy(self.embedding_table)
    for gv, gi in zip(
        self.embedding_grads0.values, self.embedding_grads0.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    actual_embed_table = self._parameters.get_embedding_param(
        self._embedding_info.name, range(len(expected_embed_table)))
    self.assertTrue(np.allclose(expected_embed_table, actual_embed_table))
    # Test applying gradients with same name
    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    req = elasticdl_pb2.PushGradientRequest()
    for g in self.grad_values1:
        emplace_tensor_pb_from_ndarray(
            req.gradients, g, name=self.var_names[0])
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 2)
    # Both gradients were reported under var_names[0], so that variable
    # receives both updates; the other variable is untouched.
    expected_values = [
        self.var_values[0]
        - self._lr * self.grad_values1[0]
        - self._lr * self.grad_values1[1],
        self.var_values[1],
    ]
    for expected_value, name in zip(expected_values, self.var_names):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            ))
def pull_variable(self, request, _):
    """Respond with all non-embedding parameters if initialized.

    Returns:
        An `elasticdl_pb2.PullVariableResponse`; `model_init_status` is
        False when the parameters are not initialized yet, otherwise the
        response carries the model version and every non-embedding
        parameter as a tensor pb.
    """
    res = elasticdl_pb2.PullVariableResponse()
    if not self._parameters.init_status:
        res.model_init_status = False
        return res
    # Only sync-SGD needs lock
    # TODO: use a read-write lock to support multiple concurrent reads
    if not self._use_async:
        self._lock.acquire()
    try:
        res.model.version = self._parameters.version
        for name, var in self._parameters.non_embedding_params.items():
            emplace_tensor_pb_from_ndarray(
                res.model.param, var.numpy(), name=name
            )
    finally:
        # Release inside finally so a failure during serialization cannot
        # leave the lock held and deadlock later sync-SGD requests.
        if not self._use_async:
            self._lock.release()
    res.model_init_status = True
    return res
def testReportGradient(self):
    """End-to-end checks of MasterServicer.ReportGradient: version
    validation, gradient validation, accumulation across reports, and
    the model update once enough gradients arrive."""
    def makeGrad():
        """ Make a ReportGradientRequest compatible with model"""
        req = elasticdl_pb2.ReportGradientRequest()
        emplace_tensor_pb_from_ndarray(
            req.gradient, np.array([0.1], np.float32), name="x")
        emplace_tensor_pb_from_ndarray(
            req.gradient, np.array([0.03, 0.06], np.float32), name="y")
        req.model_version = 1
        return req

    master = MasterServicer(
        3,
        3,
        tf.optimizers.SGD(0.1),
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master._version = 1
    master.set_model_var("x", np.array([2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

    # Report a future version, should raise exception
    req = makeGrad()
    req.model_version = 2
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an old version, should not be accepted
    req = makeGrad()
    req.model_version = 0
    res = master.ReportGradient(req, None)
    self.assertFalse(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report a unknown gradient, should raise.
    req = makeGrad()
    emplace_tensor_pb_from_ndarray(
        req.gradient, np.array([0.1], np.float32), name="z")
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an incompatible gradient, should raise.
    req = makeGrad()
    emplace_tensor_pb_from_ndarray(
        req.gradient, np.array([0.1], np.float32), name="y")
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report a current version, should be accepted.
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report a current version with part of gradients, should be accepted.
    req = makeGrad()
    req.gradient.pop()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Gradient should be accumulated.
    np.testing.assert_array_equal(
        np.array([0.2], dtype=np.float32), master._gradient_sum["x"])
    np.testing.assert_array_equal(
        np.array([0.03, 0.06], dtype=np.float32),
        master._gradient_sum["y"])
    self.assertEqual(2, master._grad_n)

    # Report a current version, should be accepted, and a new version
    # created
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(2, res.model_version)
    # Accumulators are reset after the model update is applied.
    self.assertFalse(master._gradient_sum)
    self.assertEqual(0, master._grad_n)
    np.testing.assert_array_equal(
        # [2] - 0.1 * [0.1]
        np.array([1.99], dtype=np.float32),
        master._model["x"].numpy(),
    )
    np.testing.assert_array_equal(
        # [12, 13] - 0.1 * [0.02, 0.04]
        np.array([11.998, 12.996], dtype=np.float32),
        master._model["y"].numpy(),
    )
def report_gradient_to_ps(self, grads):
    """Partition `grads` by owning PS and push them via push_gradient.

    Dense gradients are routed with self._var_to_ps; ElasticDL embedding
    gradients (which follow the dense ones in `grads`) are concatenated
    per layer, deduplicated, and scattered across PS instances by row id.

    Returns:
        ``(accepted, max_version)``: whether any PS accepted the push,
        and the largest model version among all PS responses.
    """
    self._timing.start_record_time("report_gradient")
    reqs = [
        elasticdl_pb2.PushGradientRequest() for i in range(self._ps_num)
    ]
    ps_grads = {}
    non_embed_vars_n = len(self._non_embed_vars)
    # Group dense gradients by the PS that owns each variable.
    for g, v in zip(
        grads[:non_embed_vars_n], self._non_embed_vars.values()
    ):
        ps_id = self._var_to_ps[v.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = [(g, v.name)]
        else:
            ps_grads[ps_id].append((g, v.name))
    for ps_id in ps_grads:
        req = reqs[ps_id]
        for g, name in ps_grads[ps_id]:
            emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    edl_embedding_name_values = self._collect_edl_embedding_name_values()
    if edl_embedding_name_values:
        # Embedding-layer gradients occupy the tail of `grads`.
        edl_embedding_grads = grads[non_embed_vars_n:]
        bet_number = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            bet_number += len(embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensor %d."
                % (len(edl_embedding_grads), bet_number))
        grad_accum_iter = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            # Concatenate all gradient slices and their row ids for this
            # embedding layer.
            g_values = None
            g_indices = None
            for _, ids in embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with Sparse Gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids
            # Sum up the values of the duplicated indices in the
            # gradients. It can reduce the gradient payload of the
            # dense embedding.
            g_values, g_indices = deduplicate_indexed_slices(
                values=g_values, indices=g_indices)
            results = scatter_embedding_vector(
                g_values.numpy(), g_indices.numpy(), self._ps_num)
            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                emplace_tensor_pb_from_ndarray(
                    req.gradients, values=gv, indices=gi, name=name)
    # Push to every PS concurrently, then aggregate the responses.
    report_futures = []
    for ps_id in range(self._ps_num):
        req = reqs[ps_id]
        req.model_version = self._model_versions_from_ps[ps_id]
        report_future = self._ps_stubs[ps_id].push_gradient.future(req)
        report_futures.append(report_future)
    accepted = False
    max_version = -1
    for report_future in report_futures:
        res = report_future.result()
        if res.accepted:
            accepted = True
        if res.model_version > max_version:
            max_version = res.model_version
    self._timing.end_record_time("report_gradient")
    return accepted, max_version
def test_push_model(self):
    """push_model initializes parameters (including optimizer slot
    tables) from the first call only; a second push is a no-op.

    Fixes two assertion bugs in the original: `assertTrue(a, b)` treats
    `b` as the failure message and always passes for truthy `a`, so the
    slot-table name/dim checks are replaced with `assertEqual`; the slot
    init-value check now uses `np.abs` so values far below the expected
    init value can no longer pass.
    """
    opt_func_name = "ftrl_optimizer"
    opt = load_module(_module_file).__dict__[opt_func_name]()
    opt_config = opt.get_config()
    slot_names = ["accumulator", "linear"]
    slot_init_value = {
        "accumulator": opt_config["initial_accumulator_value"],
        "linear": 0.0,
    }
    self.create_default_server_and_stub(optimizer=opt_func_name)
    param0 = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    param1 = {
        "v0": np.ones([3, 2], dtype=np.float32),
        "v1": np.ones([10, 32], dtype=np.float32),
    }
    models = [param0, param1]
    # Push both models; only the first should take effect.
    for idx, model in enumerate(models):
        req = elasticdl_pb2.Model()
        req.version = idx + 1
        for name in model:
            emplace_tensor_pb_from_ndarray(
                req.param, model[name], name=name)
        req.embedding_table_info.append(self._embedding_info)
        res = self._stub.push_model(req)
        self.assertEqual(res, empty_pb2.Empty())
    # self._parameters is initialized with the first push_model call
    # and the second push_model has no effect
    self.assertEqual(self._parameters.version, 1)
    for name in param0:
        self.assertTrue(
            np.allclose(
                param0[name],
                self._parameters.non_embedding_params[name].numpy(),
            ))
    # Embedding table metadata must match the pushed embedding info.
    self.assertEqual(
        self._embedding_info.name,
        self._parameters.embedding_params[
            self._embedding_info.name].name,
    )
    self.assertEqual(
        self._embedding_info.dim,
        self._parameters.embedding_params[
            self._embedding_info.name].dim,
    )
    self.assertEqual(
        tf.keras.initializers.get(
            self._embedding_info.initializer).__class__,
        self._parameters.embedding_params[
            self._embedding_info.name].initializer.__class__,
    )
    # Optimizer slot tables are created with the expected init values.
    for slot_name in slot_names:
        name = get_slot_table_name(self._embedding_info.name, slot_name)
        table = self._parameters.embedding_params[name]
        self.assertEqual(name, table.name)
        self.assertEqual(self._embedding_info.dim, table.dim)
        embedding = table.get([2])
        self.assertTrue(
            (np.abs(embedding - slot_init_value[slot_name])
             < 0.0001).all())
def test_push_gradient_sync_update(self):
    """Sync update with grads_to_wait=2: gradients are accumulated until
    two pushes arrive at the current version, then averaged and applied;
    a push carrying a stale model_version is rejected."""
    self.create_server_and_stub(
        grads_to_wait=2, lr_staleness_modulation=False, use_async=False)
    self.push_gradient_test_setup()
    # First push at version 0: accepted, model not yet updated.
    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values0, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads0.values,
        indices=self.embedding_grads0.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 0)
    # Second push reaches grads_to_wait: model version bumps to 1.
    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values1, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads1.values,
        indices=self.embedding_grads1.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 1)
    # A push still tagged with version 0 is now stale and rejected.
    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values1, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, False)
    self.assertEqual(res.model_version, 1)
    # Dense params: the two accepted gradients were averaged then applied.
    expected_values = [
        self.var_values[0]
        - self._lr * (self.grad_values0[0] + self.grad_values1[0]) / 2,
        self.var_values[1]
        - self._lr * (self.grad_values0[1] + self.grad_values1[1]) / 2,
    ]
    for expected_value, name in zip(expected_values, self.var_names):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            ))
    # Embedding rows: both rounds of sparse gradients applied row by row.
    expected_embed_table = np.copy(self.embedding_table)
    for gv, gi in zip(
        self.embedding_grads0.values, self.embedding_grads0.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    for gv, gi in zip(
        self.embedding_grads1.values, self.embedding_grads1.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    actual_embed_table = self._parameters.get_embedding_param(
        self._embedding_info.name, range(len(expected_embed_table)))
    self.assertTrue(np.allclose(expected_embed_table, actual_embed_table))
def _get_model_no_lock(self):
    """Serialize the current master model into a Model pb.

    Caller is responsible for holding any required lock.
    """
    model_pb = elasticdl_pb2.Model()
    model_pb.version = self._version
    for name, variable in self._model.items():
        emplace_tensor_pb_from_ndarray(
            model_pb.param, variable.numpy(), name=name
        )
    return model_pb
def report_gradient_to_ps(self, grads):
    """Partition `grads` by owning PS and push them via push_gradient.

    Dense gradients are routed with self._var_to_ps; embedding-layer
    gradients (the tail of `grads`) are concatenated per layer and
    scattered across PS instances by row id.

    Returns:
        ``(accepted, model_version)`` taken from the LAST PS response
        only (see the TODO at the bottom).
    """
    reqs = [
        elasticdl_pb2.PushGradientRequest()
        for i in range(len(self._ps_stubs))
    ]
    ps_grads = {}
    non_embed_vars_n = len(self._non_embed_vars)
    # Group dense gradients by the PS that owns each variable.
    for g, v in zip(
        grads[:non_embed_vars_n], self._non_embed_vars.values()
    ):
        ps_id = self._var_to_ps[v.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = [(g, v.name)]
        else:
            ps_grads[ps_id].append((g, v.name))
    for ps_id in ps_grads:
        req = reqs[ps_id]
        for g, name in ps_grads[ps_id]:
            emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    if self._embedding_layers:
        # Embedding-layer gradients occupy the tail of `grads`.
        edl_embedding_grads = grads[non_embed_vars_n:]
        bet_number = 0
        for layer in self._embedding_layers:
            bet_number += len(layer.embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensor %d."
                % (len(edl_embedding_grads), bet_number))
        grad_accum_iter = 0
        for layer in self._embedding_layers:
            # Concatenate all gradient slices and their row ids for this
            # embedding layer.
            g_values = None
            g_indices = None
            for _, ids in layer.embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with Sparse Gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids
            results = scatter_embedding_vector(
                g_values.numpy(), g_indices.numpy(), len(self._ps_stubs))
            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                emplace_tensor_pb_from_ndarray(
                    req.gradients, values=gv, indices=gi, name=layer.name)
    # Push to every PS concurrently and wait on all futures.
    report_futures = []
    for ps_id in range(len(self._ps_stubs)):
        req = reqs[ps_id]
        req.model_version = self._model_version
        report_future = self._ps_stubs[ps_id].push_gradient.future(req)
        report_futures.append(report_future)
    for report_future in report_futures:
        res = report_future.result()
    # TODO: choose the last response temporarily
    return res.accepted, res.model_version