예제 #1
0
    def test_check_grad(self):
        """Verify that `check_grad` rejects each kind of invalid gradient.

        Every case builds one gradient tensor and expects `check_grad` to
        raise a `ValueError` whose message matches the given pattern.
        """
        self._clear_params()
        self.params.init_from_model_pb(self.model_pb)

        # Gradient named "z", created without values or indices.
        unknown_grad = Tensor(name="z")
        with self.assertRaisesRegex(ValueError, "Name error"):
            self.params.check_grad(unknown_grad)

        # Dense gradient for "x" with a (3, 5) value array.
        dense_grad = Tensor(values=np.random.uniform(size=(3, 5)), name="x")
        with self.assertRaisesRegex(ValueError, "Non embedding param error"):
            self.params.check_grad(dense_grad)

        # Indexed gradient for "embedding_1" with 11-wide rows.
        edl_embedding_grad = Tensor(
            values=np.random.uniform(size=(3, 11)),
            indices=np.array([1, 2, 3]),
            name="embedding_1",
        )
        expected_message = "ElasticDL embedding param error"
        with self.assertRaisesRegex(ValueError, expected_message):
            self.params.check_grad(edl_embedding_grad)

        # Indexed gradient for "x" with indices 1..4.
        keras_embedding_grad = Tensor(
            values=np.random.uniform(size=(4, 4)),
            indices=np.array([1, 2, 3, 4]),
            name="x",
        )
        with self.assertRaisesRegex(ValueError, "Keras embedding param error"):
            self.params.check_grad(keras_embedding_grad)
예제 #2
0
    def setUp(self):
        """Build the `Parameters` object and the model protobuf fixtures.

        The model protobuf receives two dense tensors ("x" and "y"), one
        indexed tensor for the embedding table "embedding_1", and the
        matching `EmbeddingTableInfo` entry.
        """
        self.params = Parameters()

        self.model_pb = Model()
        self.tensors_pb = self.model_pb.param
        self.embeddings_pb = self.model_pb.embedding_table_info

        # Dense tensors with random values.
        dense_pbs = []
        for var_name, shape in (("x", (3, 4)), ("y", (4, 5))):
            values = np.random.uniform(size=shape)
            dense_pbs.append(Tensor(values, name=var_name).to_tensor_pb())
        self.tensors_pb.extend(dense_pbs)

        # Description of the embedding table.
        self.embedding_table_name = "embedding_1"
        self.embedding_dim = 10
        table_info_pb = EmbeddingTableInfo()
        table_info_pb.name = self.embedding_table_name
        table_info_pb.dim = self.embedding_dim
        table_info_pb.initializer = "uniform"

        # Two embedding vectors stored under the ids 0 and 8.
        vectors = np.random.uniform(size=(2, 10))
        ids = np.array([0, 8])
        table_tensor = Tensor(
            vectors, indices=ids, name=self.embedding_table_name
        )
        self.tensors_pb.append(table_tensor.to_tensor_pb())

        self.embeddings_pb.append(table_info_pb)
예제 #3
0
 def verify(values, name=None, indices=None):
     """Round-trip a tensor through its protobuf form and check the result.

     A `Tensor` built from the arguments is serialized with
     `serialize_tensor` and deserialized into a fresh `Tensor` with
     `deserialize_tensor_pb`. The deserialized values, indices (when
     given) and name (when given) must match the inputs.

     `self` is not a parameter; it is taken from the enclosing scope.
     """
     tensor = Tensor(values, indices, name)
     pb = elasticdl_pb2.Tensor()
     serialize_tensor(tensor, pb)
     tensor_new = Tensor()
     deserialize_tensor_pb(pb, tensor_new)
     np.testing.assert_array_equal(values, tensor_new.values)
     if indices is not None:
         np.testing.assert_array_equal(indices, tensor_new.indices)
     if name:
         # Check the deserialized tensor: asserting on the input `tensor`
         # would pass even if the name were lost during the round trip.
         self.assertEqual(name, tensor_new.name)
예제 #4
0
    def testEvaluationJob(self):
        """Check task counting, job gating and metrics of `EvaluationJob`."""
        model_version = 1
        total_tasks = 5
        latest_chkp_version = 2
        job = EvaluationJob(_eval_metrics_fn(), model_version, total_tasks)

        def check_state(completed, done, chkp_version):
            # `done` is the expected result of both `finished()` and
            # `ok_to_new_job()` for the given checkpoint version.
            expect = self.assertTrue if done else self.assertFalse
            self.assertEqual(completed, job._completed_tasks)
            expect(job.finished())
            expect(self.ok_to_new_job(job, chkp_version))

        def float_array(rows):
            return np.array(rows, np.float32)

        def output_pb(rows):
            return Tensor(
                float_array(rows), name=MetricsDictKey.MODEL_OUTPUT
            ).to_tensor_pb()

        check_state(0, False, latest_chkp_version)

        # Now make 4 tasks finished
        for _ in range(4):
            job.complete_task()
        check_state(4, False, latest_chkp_version)

        # One more task finishes
        job.complete_task()
        check_state(5, True, latest_chkp_version)

        # No new model checkpoint
        latest_chkp_version = job.model_version
        self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))
        latest_chkp_version = job.model_version + 1
        self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

        # Report two batches of outputs and labels.
        first_outputs = [output_pb([[1], [6], [3]])]
        first_labels = Tensor(float_array([[1], [0], [3]])).to_tensor_pb()
        job.report_evaluation_metrics(first_outputs, first_labels)

        second_outputs = [output_pb([[4], [5], [6], [7], [8]])]
        second_labels = Tensor(
            float_array([[7], [8], [9], [10], [11]])
        ).to_tensor_pb()
        job.report_evaluation_metrics(second_outputs, second_labels)

        expected_acc = 0.25
        summary = job.get_evaluation_summary()
        self.assertAlmostEqual(expected_acc, summary.get("acc").numpy())
        self.assertAlmostEqual(expected_acc, summary.get("acc_fn").numpy())
        self.assertAlmostEqual(10.125, summary.get("mse").numpy())
예제 #5
0
파일: worker.py 프로젝트: zuodh/elasticdl
    def get_model(self):
        """Pull the non-embedding variables from the parameter servers.

        A `pull_variable` call is issued asynchronously to every PS whose
        id is in `self._ps_vars`. If a PS reports that its model is not
        initialized, the variables are reported to that PS and pulled once
        more. Each returned tensor is assigned to the variable with the
        same name in `self._non_embed_vars`, and `self._model_version` is
        set to the largest version received (-1 if nothing was pulled).

        Raises:
            RuntimeError: If a PS is still uninitialized after the push.
        """
        request = empty_pb2.Empty()
        if self._use_multi_ps:
            self.init_ps_var_partition()

        # Start every async grpc call before waiting on any result.
        pending = [
            (stub.pull_variable.future(request), ps_id)
            for ps_id, stub in enumerate(self._ps_stubs)
            if ps_id in self._ps_vars
        ]

        newest_version = -1
        for future, ps_id in pending:
            response = future.result()
            if not response.model_init_status:
                # push variable to ps for initialization
                self.report_variable_to_ps(ps_id)
                response = self._ps_stubs[ps_id].pull_variable(request)
                if not response.model_init_status:
                    # TODO: support PS fault-tolerance
                    raise RuntimeError("PS pod %d cannot be initialized" %
                                       ps_id)

            for param_pb in response.model.param:
                pulled = Tensor.from_tensor_pb(param_pb)
                self._non_embed_vars[pulled.name].assign(pulled.to_ndarray())

            newest_version = max(newest_version, response.model.version)
        self._model_version = newest_version
예제 #6
0
    def get_model(self):
        """Pull the non-embedding variables from the parameter servers.

        Every request carries the model version last received from the
        target PS (`self._model_versions_from_ps[ps_id]`). If a PS reports
        that its model is not initialized, the variables are reported to
        that PS and pulled once more. Returned tensors are assigned to the
        variables with the same name in `self._non_embed_vars`, the per-PS
        version is updated, and `self._model_version` becomes the maximum
        of the per-PS versions. The whole call is timed as "get_model".

        Raises:
            RuntimeError: If a PS is still uninitialized after the push.
        """

        def new_request(ps_id):
            request = elasticdl_pb2.PullVariableRequest()
            request.current_model_version = (
                self._model_versions_from_ps[ps_id]
            )
            return request

        self._timing.start_record_time("get_model")
        if self._use_multi_ps:
            self.init_ps_var_partition()

        # Start every async grpc call before waiting on any result.
        pending = [
            (stub.pull_variable.future(new_request(ps_id)), ps_id)
            for ps_id, stub in enumerate(self._ps_stubs)
            if ps_id in self._ps_vars
        ]

        for future, ps_id in pending:
            response = future.result()
            if not response.model_init_status:
                # push variable to ps for initialization
                self.report_variable_to_ps(ps_id)
                retry_request = new_request(ps_id)
                response = self._ps_stubs[ps_id].pull_variable(retry_request)
                if not response.model_init_status:
                    # TODO: support PS fault-tolerance
                    raise RuntimeError("PS pod %d cannot be initialized" %
                                       ps_id)

            for param_pb in response.model.param:
                pulled = Tensor.from_tensor_pb(param_pb)
                self._non_embed_vars[pulled.name].assign(pulled.to_ndarray())
            self._model_versions_from_ps[ps_id] = response.model.version

        self._model_version = max(self._model_versions_from_ps)
        self._timing.end_record_time("get_model")
예제 #7
0
def get_params_shard_from_pb(model_pb, shard_index, shard_num):
    """Extract the parameters that belong to one shard of a model protobuf.

    A tensor without indices is kept when `string_to_id` maps its name to
    `shard_index`. A tensor with indices is treated as an embedding table
    and filtered row by row: a row is kept when `int_to_id` maps its id to
    `shard_index`.

    Args:
        model_pb: A Model protobuf instance.
        shard_index: Model shard index.
        shard_num: The total number of model shards.

    Return:
        non_embedding_vars: A Python dict in which the key is a variable
            name and the value is a `tf.Variable` object.
        embedding_table_values: A Python dict in which the key is an
            embedding table name and the value is a tuple of two lists:
            value[0] holds the ids and value[1] the corresponding
            embedding vectors. A table appears in the dict even when none
            of its rows falls into this shard.
    """
    dense_vars = {}
    table_rows = {}

    for param_pb in model_pb.param:
        tensor = Tensor.from_tensor_pb(param_pb)

        if tensor.indices is None:
            # Dense variable: the whole tensor goes to exactly one shard.
            if string_to_id(tensor.name, shard_num) == shard_index:
                dense_vars[tensor.name] = tf.Variable(
                    initial_value=tensor.values, trainable=True)
            continue

        # Embedding table: register the table, then pick this shard's rows.
        ids, vectors = table_rows.setdefault(tensor.name, ([], []))
        for embedding_id, vector in zip(tensor.indices, tensor.values):
            if int_to_id(embedding_id, shard_num) == shard_index:
                ids.append(embedding_id)
                vectors.append(vector)

    return dense_vars, table_rows
예제 #8
0
 def pull_embedding_vector(self, request, _):
     """Return the embedding vectors for `request.ids` as a tensor protobuf.

     When `request.ids` is empty, an empty `elasticdl_pb2.Tensor` is
     returned. Otherwise the vectors are looked up by `request.name` and
     `request.ids` and serialized into the response.
     """
     response = elasticdl_pb2.Tensor()
     if request.ids:
         vectors = self._parameters.get_embedding_param(
             request.name, request.ids)
         serialize_tensor(Tensor(values=vectors), response)
     return response
예제 #9
0
 def to_tensor(self):
     """Convert the embedding table to an ElasticDL Tensor.

     The keys of `self.embedding_vectors` become the tensor indices and
     the mapped vectors become the tensor values, in iteration order.
     """
     rows = list(self.embedding_vectors.items())
     row_ids = [row_id for row_id, _ in rows]
     vectors = [vector for _, vector in rows]
     return Tensor(
         values=np.array(vectors),
         indices=np.array(row_ids),
         name=self.name,
     )
예제 #10
0
 def _get_non_embedding_variables(self, version, method):
     """Fetch the model from master and return its parameters by name.

     Sends a `GetModelRequest` with the given `version` and `method` and
     returns a dict that maps each tensor name in `model.param` to the
     result of `Tensor.to_ndarray()`.
     """
     request = elasticdl_pb2.GetModelRequest()
     request.version = version
     request.method = method
     model = self._stub.GetModel(request, None)
     tensors = map(Tensor.from_tensor_pb, model.param)
     return {tensor.name: tensor.to_ndarray() for tensor in tensors}
예제 #11
0
    def test_deserialize_tensor_pb(self):
        """Check `deserialize_tensor_pb` on valid and invalid protobufs.

        The same `pb` and `tensor` objects are reused across the cases, so
        each case resets the protobuf fields it depends on.
        """
        pb = elasticdl_pb2.Tensor()
        tensor = Tensor()
        # No dim defined, should raise.
        self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

        # Empty array, should be ok.
        pb.dim.append(0)
        pb.content = b""
        pb.dtype = tensor_dtype_pb2.DT_FLOAT32
        deserialize_tensor_pb(pb, tensor)
        np.testing.assert_array_equal(np.array([], dtype=np.float32),
                                      tensor.values)

        # Wrong type, should raise
        del pb.dim[:]
        pb.dim.append(0)
        pb.content = b""
        pb.dtype = tensor_dtype_pb2.DT_INVALID
        self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

        # Pathological case, one of the dimensions is 0.
        del pb.dim[:]
        pb.dim.extend([2, 0, 1, 9])
        pb.content = b""
        pb.dtype = tensor_dtype_pb2.DT_FLOAT32
        deserialize_tensor_pb(pb, tensor)
        np.testing.assert_array_equal(
            np.ndarray(shape=[2, 0, 1, 9], dtype=np.float32), tensor.values)

        # Wrong content size, should raise
        del pb.dim[:]
        pb.dim.append(11)
        pb.content = b"\0" * (4 * 12)
        pb.dtype = tensor_dtype_pb2.DT_FLOAT32
        self.assertRaises(ValueError, deserialize_tensor_pb, pb, tensor)

        # Compatible dimensions, should be ok.
        for m in (1, 2, 3, 4, 6, 12):
            for with_indices in [True, False]:
                del pb.dim[:]
                # Clear the indices left over from earlier iterations.
                # Otherwise the "without indices" case would still carry
                # indices and their count would not match the first dim.
                del pb.indices[:]
                pb.content = b"\0" * (4 * 12)
                pb.dim.extend([m, 12 // m])
                if with_indices:
                    pb.indices.extend([0] * m)
                pb.dtype = tensor_dtype_pb2.DT_FLOAT32
                deserialize_tensor_pb(pb, tensor)
                self.assertEqual((m, 12 // m), tensor.values.shape)
                self.assertTrue(isinstance(tensor.values, np.ndarray))
                if tensor.indices is not None:
                    self.assertTrue(isinstance(tensor.indices, np.ndarray))
예제 #12
0
    def get_model_from_master(self, version, method):
        """Fetch the model from master and load it into the local variables.

        Each tensor in the returned `model.param` is assigned to the
        variable with the same name in `self._non_embed_vars`, and
        `self._model_version` is set to `model.version`.
        """
        request = elasticdl_pb2.GetModelRequest()
        request.version = version
        request.method = method
        model = self._stub.GetModel(request)

        # Assumes all trainable variables exist in model.param.
        for param_pb in model.param:
            pulled = Tensor.from_tensor_pb(param_pb)
            target_var = self._non_embed_vars[pulled.name]
            target_var.assign(pulled.to_ndarray())
        self._model_version = model.version
예제 #13
0
 def report_evaluation_metrics(self, model_outputs, labels):
     """Send the model outputs and labels of an evaluation via the stub.

     `model_outputs` is a dict whose values are sequences of arrays; each
     sequence is concatenated and stored in the request under its key.
     `labels` is a sequence of arrays that is concatenated as well.
     """
     request = elasticdl_pb2.ReportEvaluationMetricsRequest()
     for output_name, chunks in model_outputs.items():
         merged_output = np.concatenate(chunks)
         emplace_tensor_pb_from_ndarray(
             request.model_outputs, merged_output, name=output_name)
     merged_labels = np.concatenate(labels)
     serialize_tensor(Tensor(values=merged_labels), request.labels)
     self._stub.report_evaluation_metrics(request)
예제 #14
0
    def test_emplace_tensor_pb_from_ndarray(self):
        """Check `emplace_tensor_pb_from_ndarray` against `to_tensor_pb`.

        The protobuf appended to `model.param` must have the same fields
        as the one produced by `Tensor(values, indices, name)`.
        """
        values = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], np.float32)
        indices = np.array([0, 2])
        name = "test"
        model = elasticdl_pb2.Model()
        emplace_tensor_pb_from_ndarray(model.param, values, indices, name)
        # The helper appends, so the new protobuf is the last element.
        pb = model.param[-1]

        expected_pb = Tensor(values, indices, name).to_tensor_pb()
        self.assertEqual(pb.name, expected_pb.name)
        self.assertEqual(pb.dim, expected_pb.dim)
        self.assertEqual(pb.content, expected_pb.content)
        self.assertEqual(pb.indices, expected_pb.indices)
        self.assertEqual(pb.dtype, expected_pb.dtype)
예제 #15
0
 def _restore_params_from_pb(self, tensors_pb):
     """Restore dense and embedding parameters from tensor protobufs.

     A protobuf with indices is written into the embedding table of the
     same name in `self.embedding_params`. A protobuf without indices
     becomes a trainable `tf.Variable` stored in
     `self.non_embedding_params` under the protobuf name.
     """
     for tensor_pb in tensors_pb:
         param_name = tensor_pb.name
         if tensor_pb.indices:
             # Only pb of embedding parameters has indices.
             restored = Tensor()
             deserialize_tensor_pb(tensor_pb, restored)
             table = self.embedding_params[param_name]
             table.set(restored.indices, restored.values)
             continue

         # `tf.Variable` rewrites the name it is given: passing "somename"
         # results in "somename:0". `tf.Variable.name` is therefore not
         # used on the PS side; the protobuf name is the dict key instead.
         values = tensor_pb_to_ndarray(tensor_pb)
         self.non_embedding_params[param_name] = tf.Variable(
             initial_value=values, trainable=True)
예제 #16
0
 def report_evaluation_metrics(self, model_outputs, labels):
     """Report evaluation outputs and labels via the stub.

     `model_outputs` is a dict whose values are sequences of arrays; each
     sequence is concatenated and stored in the request under its key.
     `labels` is concatenated as well. The request carries
     `self._model_version` when multiple PS are used and -1 otherwise.

     Returns:
         The `(accepted, model_version)` pair taken from the rpc response.
     """
     request = elasticdl_pb2.ReportEvaluationMetricsRequest()
     for output_name, chunks in model_outputs.items():
         merged_output = np.concatenate(chunks)
         emplace_tensor_pb_from_ndarray(
             request.model_outputs, merged_output, name=output_name)
     merged_labels = np.concatenate(labels)
     serialize_tensor(Tensor(values=merged_labels), request.labels)
     if self._use_multi_ps:
         request.model_version = self._model_version
     else:
         request.model_version = -1
     response = self._stub.ReportEvaluationMetrics(request)
     return response.accepted, response.model_version
예제 #17
0
    def ReportGradient(self, request, _):
        """Validate the reported gradients and pass them on for processing.

        In sync mode a request whose model version fails
        `_validate_model_version` is dropped. Otherwise every gradient is
        checked against the model before any of them is processed:

        * A gradient whose name is not in `self._model` must be an indexed
          slice; it is collected as an ElasticDL embedding gradient.
        * An indexed-slice gradient for a model variable must match the
          variable's second dimension and must not index past its first.
        * A dense gradient must have exactly the variable's shape.

        Args:
            request: The request holding `gradient` and `model_version`.
            _: Unused second rpc argument.

        Returns:
            A `ReportGradientResponse` with `accepted` and the current
            `model_version`.

        Raises:
            ValueError: If a gradient fails one of the checks above.
        """
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version)

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        non_embedding_gradients = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for v in request.gradient:
            tensor = Tensor.from_tensor_pb(v)
            name = tensor.name
            if name not in self._model:
                if not tensor.is_indexed_slices():
                    raise ValueError(
                        "Gradient key: %s is not part of model" % name)
                # grads of ElasticDL Embedding layer
                # TODO: check arr.shape[1] = embedding_dim of this
                # EdlEmbedding layer
                edl_embedding_gradients[name] = tensor.to_tf_tensor()
                continue

            # Convert the variable once; every check below needs its shape.
            model_shape = self._model[name].numpy().shape
            if tensor.is_indexed_slices():
                if tensor.values.shape[1] != model_shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d" % (
                            name,
                            tensor.values.shape[1],
                            model_shape[1],
                        ))

                max_index = tf.math.reduce_max(tensor.indices).numpy()
                if max_index >= model_shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d" % (
                            name,
                            max_index,
                            model_shape[0] - 1,
                        ))
                indexed_grads[name] = tensor.to_tf_tensor()
            else:
                if tensor.values.shape != model_shape:
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % name)
                non_embedding_gradients[name] = tensor.to_tf_tensor()

        # Only sync mode serializes gradient processing. The lock is
        # released in `finally` so that an exception raised while
        # processing cannot leave it held.
        need_lock = not self._use_async
        if need_lock:
            self._lock.acquire()
        try:
            self._process_gradients(
                edl_embedding_gradients,
                indexed_grads,
                non_embedding_gradients,
                request.model_version,
            )
        finally:
            if need_lock:
                self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res
예제 #18
0
    def push_gradient(self, request, _):
        """Apply or accumulate the gradients pushed by a worker.

        Async mode: every gradient is checked and applied immediately, then
        the parameter version is incremented under `self._version_lock`.

        Sync mode: a request whose model version is older than the current
        version minus `self._sync_version_tolerance` is rejected. Accepted
        gradients are added to `self._grads_buffer` under `self._lock`.
        Once `self._grads_to_wait` pushes have been collected, the buffered
        gradients are applied, the buffer is cleared and the version is
        incremented.

        Returns:
            A `PushGradientResponse` with `accepted` and `model_version`.
        """
        res = elasticdl_pb2.PushGradientResponse()

        if self._use_async:
            grad_vars = []
            for grad_pb in request.gradients:
                grad = Tensor.from_tensor_pb(grad_pb)
                self._parameters.check_grad(grad)
                grad_name = grad.name
                var = self._parameters.get_non_embedding_param(grad_name)
                tf_grad = grad.to_tf_tensor()
                # Without a non-embedding variable the gradient is paired
                # with its name instead.
                target = grad_name if var is None else var
                grad_vars.append((tf_grad, target))

            if self._lr_scheduler:
                self._lr_scheduler.set_model_version(self._parameters.version)
            self._optimizer.apply_gradients(grad_vars)
            with self._version_lock:
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
            self._report_version_if_needed(version)

            res.accepted = True
            res.model_version = self._parameters.version
            return res

        # Sync mode: reject gradients computed on a too old model.
        oldest_accepted = (
            self._parameters.version - self._sync_version_tolerance
        )
        if request.model_version < oldest_accepted:
            res.accepted = False
            res.model_version = self._parameters.version
            return res

        with self._lock:
            for grad_pb in request.gradients:
                grad = Tensor.from_tensor_pb(grad_pb)
                self._parameters.check_grad(grad)
                if grad.name in self._grads_buffer:
                    accumulated = self._grads_buffer[grad.name] + grad
                    self._grads_buffer[grad.name] = accumulated
                else:
                    self._grads_buffer[grad.name] = grad

            self._grads_n += 1
            res.accepted = True

            version_bumped = False
            version = self._parameters.version
            if self._grads_n == self._grads_to_wait:
                grad_vars = []
                for grad_name, grad in self._grads_buffer.items():
                    # Dense gradients are averaged,
                    # while sparse gradients are summed
                    if not grad.is_indexed_slices():
                        grad.values = grad.values / self._grads_to_wait
                    var = self._parameters.get_non_embedding_param(grad_name)
                    tf_grad = grad.to_tf_tensor()
                    target = grad_name if var is None else var
                    grad_vars.append((tf_grad, target))

                if self._lr_scheduler:
                    self._lr_scheduler.set_model_version(
                        self._parameters.version)
                self._optimizer.apply_gradients(grad_vars)
                self._grads_n = 0
                self._grads_buffer.clear()
                self._parameters.version += 1
                self._save_params_to_checkpoint_if_needed()
                version = self._parameters.version
                version_bumped = True

        # The version report happens after `self._lock` has been released.
        if version_bumped:
            self._report_version_if_needed(version)
        res.model_version = version
        return res
예제 #19
0
    def test_tensor_data_structure(self):
        """Check the `Tensor` accessors, conversions and protobuf round trip.

        Covers dense and indexed tensors, conversion to a TensorFlow
        tensor, `to_tensor_pb` / `from_tensor_pb`, and `to_ndarray`.
        """
        # Test tensor values, without indices
        arr = np.ndarray(shape=[3, 1, 2, 4], dtype=np.int32)
        tensor = Tensor(arr)
        self.assertTrue(np.array_equal(arr, tensor.values))
        self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor()))
        self.assertFalse(tensor.is_indexed_slices())

        # Test tensor values, with indices
        indices = np.array([2, 0, 1])
        tensor = Tensor(arr, indices)
        self.assertTrue(np.array_equal(arr, tensor.values))
        self.assertTrue(np.array_equal(indices, tensor.indices))
        self.assertTrue(np.array_equal(arr, tensor.to_tf_tensor().values))
        self.assertTrue(np.array_equal(indices, tensor.to_tf_tensor().indices))
        self.assertTrue(tensor.is_indexed_slices())

        # Test round trip
        # tensor to tensor PB
        tensor = Tensor(arr, indices, name="test")
        pb = tensor.to_tensor_pb()
        self.assertEqual(pb.name, "test")
        self.assertEqual(pb.dim, [3, 1, 2, 4])
        self.assertEqual(pb.dtype, tensor_dtype_pb2.DT_INT32)
        np.testing.assert_array_equal(pb.indices, indices)

        # tensor PB to tensor
        tensor_new = Tensor.from_tensor_pb(pb)
        # Assert on the restored tensor: the source `tensor` was created
        # with this name, so checking it would not test the round trip.
        self.assertEqual(tensor_new.name, "test")
        np.testing.assert_array_equal(tensor_new.values, arr)
        np.testing.assert_array_equal(tensor_new.indices, indices)

        # Test Tensor().to_ndarray()
        values = np.array([[1.0, 2.0], [3.0, 4.0]])
        indices = np.array([0, 2])
        name = "test"
        # With indices, `to_ndarray` is expected to be unsupported.
        tensor = Tensor(values, indices, name)
        self.assertRaises(NotImplementedError, tensor.to_ndarray)
        tensor = Tensor(values, name=name)
        self.assertTrue(np.allclose(values, tensor.to_ndarray()))
예제 #20
0
    def testEvaluationService(self):
        """Check `EvaluationService` before and during an evaluation job."""
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testEvaluationService")
            checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
            training_shards = {"f1": (0, 10), "f2": (0, 10)}
            evaluation_shards = {"f1": (0, 10), "f2": (0, 10)}
            task_d = _TaskDispatcher(
                training_shards,
                evaluation_shards,
                {},
                3,
                1,
            )

            # Evaluation metrics will not be accepted if no evaluation ongoing
            evaluation_service = EvaluationService(
                checkpoint_service,
                None,
                task_d,
                10,
                20,
                0,
                False,
                _eval_metrics_fn,
            )
            output_tensor = Tensor(
                np.array([1, 6, 3], np.float32),
                name=MetricsDictKey.MODEL_OUTPUT,
            )
            model_outputs = [output_tensor.to_tensor_pb()]
            labels = Tensor(np.array([1, 0, 3], np.float32)).to_tensor_pb()

            accepted = evaluation_service.report_evaluation_metrics(
                1, model_outputs, labels)
            self.assertFalse(accepted)

            # No checkpoint available
            self.assertFalse(evaluation_service.try_to_create_new_job())

            master = MasterServicer(
                2,
                2,
                None,
                task_d,
                init_var=[],
                checkpoint_filename_for_init="",
                checkpoint_service=checkpoint_service,
                evaluation_service=evaluation_service,
            )
            master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

            # Add an evaluation task and we can start evaluation
            self.assertEqual(8, len(task_d._todo))
            evaluation_service.add_evaluation_task(False)
            self.assertEqual(8, len(task_d._eval_todo))
            self.assertFalse(evaluation_service._eval_job.finished())

            # The job must stay unfinished until its last task completes.
            for _ in range(8):
                self.assertFalse(evaluation_service._eval_job.finished())
                evaluation_service.complete_task()
            self.assertIsNone(evaluation_service._eval_job)
            self.assertFalse(evaluation_service.try_to_create_new_job())
예제 #21
0
 def _ndarray_to_tensor_pb(values, name=None, indices=None):
     """Wrap `values` (with optional indices and name) in a tensor protobuf."""
     tensor = Tensor(values=values, indices=indices, name=name)
     return tensor.to_tensor_pb()