def test_push_gradient_async_update(self):
    self.create_default_server_and_stub()
    self.push_gradient_test_setup()

    # Test applying gradients to embedding and non-embedding parameters
    req = elasticdl_pb2.PushGradientRequest()
    for g, name in zip(self.grad_values0, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads0.values,
        indices=self.embedding_grads0.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 1)

    expected_values = [
        v - self._lr * g
        for v, g in zip(self.var_values, self.grad_values0)
    ]
    for name, expected_value in zip(self.var_names, expected_values):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            )
        )

    expected_embed_table = np.copy(self.embedding_table)
    for gv, gi in zip(
        self.embedding_grads0.values, self.embedding_grads0.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    actual_embed_table = self._parameters.get_embedding_param(
        self._embedding_info.name, range(len(expected_embed_table))
    )
    self.assertTrue(np.allclose(expected_embed_table, actual_embed_table))

    # Test applying multiple gradients with the same variable name
    for name, var in zip(self.var_names, self.var_values):
        self._parameters.non_embedding_params[name] = tf.Variable(var)
    req = elasticdl_pb2.PushGradientRequest()
    for g in self.grad_values1:
        emplace_tensor_pb_from_ndarray(
            req.gradients, g, name=self.var_names[0]
        )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 2)

    expected_values = [
        self.var_values[0]
        - self._lr * self.grad_values1[0]
        - self._lr * self.grad_values1[1],
        self.var_values[1],
    ]
    for expected_value, name in zip(expected_values, self.var_names):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            )
        )
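# A toy sketch of the async-update rule the test above verifies: every
# accepted report is applied immediately with plain SGD, and gradients
# sharing one variable name are each applied in turn (for vanilla SGD
# this is equivalent to applying the summed gradient once). The names
# here are illustrative, not the ElasticDL API.
import numpy as np


def apply_async_update(var, grads, lr):
    # One SGD step per reported gradient.
    for g in grads:
        var = var - lr * g
    return var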
def report_gradient_to_ps(self, grads):
    self._timing.start_record_time("report_gradient")
    reqs = [
        elasticdl_pb2.PushGradientRequest() for i in range(self._ps_num)
    ]
    ps_grads = {}
    non_embed_vars_n = len(self._non_embed_vars)
    # Group non-embedding gradients by the PS instance that owns
    # each variable.
    for g, v in zip(
        grads[:non_embed_vars_n], self._non_embed_vars.values()
    ):
        ps_id = self._var_to_ps[v.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = [(g, v.name)]
        else:
            ps_grads[ps_id].append((g, v.name))

    for ps_id in ps_grads:
        req = reqs[ps_id]
        for g, name in ps_grads[ps_id]:
            emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)

    edl_embedding_name_values = self._collect_edl_embedding_name_values()

    if edl_embedding_name_values:
        edl_embedding_grads = grads[non_embed_vars_n:]
        bet_number = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            bet_number += len(embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensors %d."
                % (len(edl_embedding_grads), bet_number)
            )

        grad_accum_iter = 0
        for name, embedding_and_ids in edl_embedding_name_values:
            g_values = None
            g_indices = None
            for _, ids in embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with sparse gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids

            # Sum up the values of the duplicated indices in the
            # gradients. It reduces the gradient payload of the
            # dense embedding.
            g_values, g_indices = deduplicate_indexed_slices(
                values=g_values, indices=g_indices
            )

            # Partition the deduplicated embedding rows across the
            # PS instances that own them.
            results = scatter_embedding_vector(
                g_values.numpy(), g_indices.numpy(), self._ps_num
            )

            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                emplace_tensor_pb_from_ndarray(
                    req.gradients, values=gv, indices=gi, name=name
                )

    # Send all requests concurrently, then wait for the responses.
    report_futures = []
    for ps_id in range(self._ps_num):
        req = reqs[ps_id]
        req.model_version = self._model_versions_from_ps[ps_id]
        report_future = self._ps_stubs[ps_id].push_gradient.future(req)
        report_futures.append(report_future)

    accepted = False
    max_version = -1
    for report_future in report_futures:
        res = report_future.result()
        if res.accepted:
            accepted = True
        if res.model_version > max_version:
            max_version = res.model_version
    self._timing.end_record_time("report_gradient")
    return accepted, max_version
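# A minimal sketch of what deduplicate_indexed_slices above is assumed
# to do: sum gradient rows that share an index so each embedding id is
# sent to the PS at most once. This mirrors TensorFlow's own
# IndexedSlices deduplication; the actual ElasticDL helper may differ.
import tensorflow as tf


def deduplicate_indexed_slices_sketch(values, indices):
    # Map each original index to its position in the unique-index list.
    unique_indices, new_positions = tf.unique(indices)
    # Sum all rows of `values` that map to the same unique index.
    summed_values = tf.math.unsorted_segment_sum(
        values, new_positions, tf.shape(unique_indices)[0]
    )
    return summed_values, unique_indices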
def test_push_gradient_sync_update(self):
    self.create_server_and_stub(
        grads_to_wait=2, lr_staleness_modulation=False, use_async=False
    )
    self.push_gradient_test_setup()

    # First report: accepted, but the model version does not advance
    # until grads_to_wait reports have arrived.
    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values0, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads0.values,
        indices=self.embedding_grads0.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 0)

    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values1, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    emplace_tensor_pb_from_ndarray(
        req.gradients,
        values=self.embedding_grads1.values,
        indices=self.embedding_grads1.indices,
        name=self._embedding_info.name,
    )
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, True)
    self.assertEqual(res.model_version, 1)

    # A stale report (model_version 0 after the model has advanced to
    # version 1) should be rejected.
    req = elasticdl_pb2.PushGradientRequest()
    req.model_version = 0
    for g, name in zip(self.grad_values1, self.var_names):
        emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)
    res = self._stub.push_gradient(req)
    self.assertEqual(res.accepted, False)
    self.assertEqual(res.model_version, 1)

    expected_values = [
        self.var_values[0]
        - self._lr * (self.grad_values0[0] + self.grad_values1[0]) / 2,
        self.var_values[1]
        - self._lr * (self.grad_values0[1] + self.grad_values1[1]) / 2,
    ]
    for expected_value, name in zip(expected_values, self.var_names):
        self.assertTrue(
            np.allclose(
                expected_value,
                self._parameters.non_embedding_params[name].numpy(),
            )
        )

    expected_embed_table = np.copy(self.embedding_table)
    for gv, gi in zip(
        self.embedding_grads0.values, self.embedding_grads0.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    for gv, gi in zip(
        self.embedding_grads1.values, self.embedding_grads1.indices
    ):
        expected_embed_table[gi] -= self._lr * gv
    actual_embed_table = self._parameters.get_embedding_param(
        self._embedding_info.name, range(len(expected_embed_table))
    )
    self.assertTrue(np.allclose(expected_embed_table, actual_embed_table))
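# A toy sketch of the sync-update rule the test above verifies: the PS
# buffers reports until grads_to_wait of them arrive, applies one SGD
# step with the averaged gradient, and rejects reports whose
# model_version lags the current one. Names are illustrative, not the
# ElasticDL API.
import numpy as np


class SyncSGDSketch:
    def __init__(self, var, lr, grads_to_wait):
        self.var, self.lr, self.grads_to_wait = var, lr, grads_to_wait
        self.version, self.buffer = 0, []

    def push_gradient(self, grad, model_version):
        if model_version < self.version:
            return False, self.version  # stale report: rejected
        self.buffer.append(grad)
        if len(self.buffer) >= self.grads_to_wait:
            # Average the buffered gradients and take one SGD step.
            self.var -= self.lr * np.mean(self.buffer, axis=0)
            self.version += 1
            self.buffer = []
        return True, self.version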
def report_gradient_to_ps(self, grads):
    reqs = [
        elasticdl_pb2.PushGradientRequest()
        for i in range(len(self._ps_stubs))
    ]
    ps_grads = {}
    non_embed_vars_n = len(self._non_embed_vars)
    # Group non-embedding gradients by the PS instance that owns
    # each variable.
    for g, v in zip(
        grads[:non_embed_vars_n], self._non_embed_vars.values()
    ):
        ps_id = self._var_to_ps[v.name]
        if ps_id not in ps_grads:
            ps_grads[ps_id] = [(g, v.name)]
        else:
            ps_grads[ps_id].append((g, v.name))

    for ps_id in ps_grads:
        req = reqs[ps_id]
        for g, name in ps_grads[ps_id]:
            emplace_tensor_pb_from_ndarray(req.gradients, g, name=name)

    if self._embedding_layers:
        edl_embedding_grads = grads[non_embed_vars_n:]
        bet_number = 0
        for layer in self._embedding_layers:
            bet_number += len(layer.embedding_and_ids)
        if len(edl_embedding_grads) != bet_number:
            raise ValueError(
                "elasticdl.layers.embedding related gradient number %d "
                "does not match the number of its output tensors %d."
                % (len(edl_embedding_grads), bet_number)
            )

        grad_accum_iter = 0
        for layer in self._embedding_layers:
            g_values = None
            g_indices = None
            for _, ids in layer.embedding_and_ids:
                grad = edl_embedding_grads[grad_accum_iter]
                grad_accum_iter += 1
                # ElasticDL embedding layer with sparse gradients
                if isinstance(grad, tf.IndexedSlices):
                    grad = grad.values
                if g_values is not None:
                    g_values = tf.concat([g_values, grad], axis=0)
                    g_indices = tf.concat([g_indices, ids], axis=0)
                else:
                    g_values = grad
                    g_indices = ids

            # Partition the embedding rows across the PS instances
            # that own them.
            results = scatter_embedding_vector(
                g_values.numpy(), g_indices.numpy(), len(self._ps_stubs)
            )

            for ps_id in results:
                req = reqs[ps_id]
                gv, gi = results[ps_id]
                emplace_tensor_pb_from_ndarray(
                    req.gradients, values=gv, indices=gi, name=layer.name
                )

    # Send all requests concurrently, then wait for the responses.
    report_futures = []
    for ps_id in range(len(self._ps_stubs)):
        req = reqs[ps_id]
        req.model_version = self._model_version
        report_future = self._ps_stubs[ps_id].push_gradient.future(req)
        report_futures.append(report_future)

    for report_future in report_futures:
        res = report_future.result()
    # TODO: choose the last response temporarily
    return res.accepted, res.model_version
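# A minimal sketch of the scatter_embedding_vector call used above,
# assuming rows are partitioned across PS instances by `id % ps_num`
# (the actual ElasticDL hashing scheme may differ). It returns
# {ps_id: (values, indices)} so each PS receives only the embedding
# rows it owns.
import numpy as np


def scatter_embedding_vector_sketch(values, indices, ps_num):
    results = {}
    for ps_id in range(ps_num):
        # Select the rows whose ids hash to this PS instance.
        mask = indices % ps_num == ps_id
        if np.any(mask):
            results[ps_id] = (values[mask], indices[mask])
    return results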