예제 #1
0
    def test_ndarray_to_tensor(self):
        """Check the `dim` and `content` size that ndarray_to_tensor produces."""
        # An integer array is expected to be rejected with ValueError.
        int_array = np.array([1, 2, 3, 4])
        with self.assertRaises(ValueError):
            ndarray_to_tensor(int_array)

        bytes_per_float32 = 4
        # Each case: (input array, expected dim, expected element count)
        cases = [
            # Empty array
            (np.array([], dtype=np.float32), [0], 0),
            # Pathological case, one of the dimensions is 0.
            (
                np.ndarray(shape=[2, 0, 1, 9], dtype=np.float32),
                [2, 0, 1, 9],
                0,
            ),
            # 1-D array
            (np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), [4], 4),
            # 4-D array with uninitialized (arbitrary) values
            (
                np.ndarray(shape=[2, 1, 3, 4], dtype=np.float32),
                [2, 1, 3, 4],
                2 * 1 * 3 * 4,
            ),
        ]
        for source, expected_dim, element_count in cases:
            converted = ndarray_to_tensor(source)
            self.assertEqual(expected_dim, converted.dim)
            self.assertEqual(
                bytes_per_float32 * element_count, len(converted.content)
            )
예제 #2
0
    def report_gradient(self, grads):
        """
        Report `grads` to the ps through the `ReportGradient` rpc.

        The leading entries of `grads` are matched, in order, with
        `self._model.trainable_variables`. Any remaining entries are
        matched, in order, with the `bet_ids_pair` items of the layers in
        `self._embedding_layers`, and merged into one gradient per layer.

        Returns (accepted, model_version) from the rpc response.
        Raises ValueError when the number of remaining gradients differs
        from the total number of `bet_ids_pair` items.
        """
        request = elasticdl_pb2.ReportGradientRequest()
        model_vars = self._model.trainable_variables
        model_var_count = len(model_vars)

        # should keep the same order as self.get_trainable_items()
        for grad, var in zip(grads[:model_var_count], model_vars):
            target = request.gradient[var.name]
            if not isinstance(grad, tf.IndexedSlices):
                target.CopyFrom(ndarray_to_tensor(grad.numpy()))
                continue
            # Sparse gradient: send the values together with their indices
            target.CopyFrom(
                ndarray_to_tensor(
                    grad.values.numpy(), tuple(grad.indices.numpy())
                )
            )

        # deal with gradients of ElasticDL embedding layer
        # should keep the same order as self.get_trainable_items()
        if self._embedding_layers:
            embedding_grads = grads[model_var_count:]

            expected_count = sum(
                len(layer.bet_ids_pair) for layer in self._embedding_layers
            )
            if len(embedding_grads) != expected_count:
                raise ValueError(
                    "elasticdl.layers.embedding related gradient number %d "
                    "does not match the number of its output tensor %d."
                    % (len(embedding_grads), expected_count)
                )

            # The count check above guarantees one gradient per pair, so
            # the iterator is consumed exactly.
            pending_grads = iter(embedding_grads)
            for layer in self._embedding_layers:
                merged_values = None
                merged_ids = None
                for _, ids in layer.bet_ids_pair:
                    grad = next(pending_grads)
                    # ElasticDL embedding layer with Sparse Gradients
                    if isinstance(grad, tf.IndexedSlices):
                        grad = grad.values
                    if merged_values is None:
                        merged_values, merged_ids = grad, ids
                        continue
                    merged_values = tf.concat([merged_values, grad], axis=0)
                    merged_ids = tf.concat([merged_ids, ids], axis=0)

                request.gradient[layer.name].CopyFrom(
                    ndarray_to_tensor(
                        merged_values.numpy(), tuple(merged_ids.numpy())
                    )
                )

        request.model_version = self._model_version
        response = self._stub.ReportGradient(request)
        return response.accepted, response.model_version
예제 #3
0
    def testEvaluationJob(self):
        """Drive an `_EvaluationJob` through its tasks and metric reports."""

        def mse_report(values):
            # Build a metrics dict holding `values` as a float32 "mse" tensor
            return {
                "mse": ndarray_to_tensor(np.array(values, dtype=np.float32))
            }

        task_count = 5
        job = _EvaluationJob(1, task_count)
        checkpoint_version = 2

        # A fresh job has no completed task and is not finished
        self.assertEqual(0, job._completed_tasks)
        self.assertFalse(job.finished())
        self.assertFalse(self.ok_to_new_job(job, checkpoint_version))

        # Now make 4 tasks finished
        for _ in range(task_count - 1):
            job.complete_task()
        self.assertEqual(task_count - 1, job._completed_tasks)
        self.assertFalse(job.finished())
        self.assertFalse(self.ok_to_new_job(job, checkpoint_version))

        # One more task finishes
        job.complete_task()
        self.assertEqual(task_count, job._completed_tasks)
        self.assertTrue(job.finished())
        self.assertTrue(self.ok_to_new_job(job, checkpoint_version))

        # No new model checkpoint
        self.assertFalse(self.ok_to_new_job(job, job.model_version))
        self.assertTrue(self.ok_to_new_job(job, job.model_version + 1))

        # At the beginning, no metrics
        self.assertFalse(job._evaluation_metrics)

        # A report for another model version is refused and stores nothing
        first_report = mse_report([100, 200])
        self.assertFalse(
            job.report_evaluation_metrics(job.model_version + 1, first_report)
        )
        self.assertFalse(job._evaluation_metrics)

        # A report for the job's own model version is accepted
        current_version = job.model_version
        self.assertTrue(
            job.report_evaluation_metrics(current_version, first_report)
        )

        # One more
        job.report_evaluation_metrics(current_version, mse_report([300, 400]))
        expected_mse = np.array([200, 300], dtype=np.float32)
        self.assertTrue(
            np.array_equal(
                expected_mse, job.get_evaluation_summary().get("mse")
            )
        )
예제 #4
0
    def report_gradient(self, grads):
        """
        Report `grads` to the ps through the `ReportGradient` rpc.

        The first `len(self._non_embed_vars)` entries of `grads` are the
        gradients of `self._non_embed_vars`, in the same order. The
        remaining entries are matched, in order, with the
        `embedding_and_ids` items of the layers in
        `self._embedding_layers` and merged into one gradient per layer.

        Returns (accepted, model_version) from the rpc response.
        Raises ValueError when the number of remaining gradients differs
        from the total number of `embedding_and_ids` items.
        """
        request = elasticdl_pb2.ReportGradientRequest()
        dense_count = len(self._non_embed_vars)

        for grad, var in zip(grads[:dense_count], self._non_embed_vars):
            target = request.gradient[var.name]
            if not isinstance(grad, tf.IndexedSlices):
                target.CopyFrom(ndarray_to_tensor(grad.numpy()))
                continue
            # Sparse gradient: send the values together with their indices
            target.CopyFrom(
                ndarray_to_tensor(grad.values.numpy(),
                                  tuple(grad.indices.numpy())))

        # Accumulate gradients of ElasticDL embedding layer
        if self._embedding_layers:
            embedding_grads = grads[dense_count:]

            # Every embedding layer may hold more than one
            # `embedding_and_ids` item, and each item needs its own gradient.
            expected_count = sum(
                len(layer.embedding_and_ids)
                for layer in self._embedding_layers)
            if len(embedding_grads) != expected_count:
                raise ValueError(
                    "elasticdl.layers.embedding related gradient number %d "
                    "does not match the number of its output tensor %d." %
                    (len(embedding_grads), expected_count))

            # The count check above guarantees one gradient per item, so
            # the iterator is consumed exactly.
            pending_grads = iter(embedding_grads)
            for layer in self._embedding_layers:
                merged_values = None
                merged_ids = None
                for _, ids in layer.embedding_and_ids:
                    grad = next(pending_grads)
                    # ElasticDL embedding layer with Sparse Gradients
                    if isinstance(grad, tf.IndexedSlices):
                        grad = grad.values
                    if merged_values is None:
                        merged_values, merged_ids = grad, ids
                        continue
                    merged_values = tf.concat([merged_values, grad], axis=0)
                    merged_ids = tf.concat([merged_ids, ids], axis=0)

                request.gradient[layer.name].CopyFrom(
                    ndarray_to_tensor(merged_values.numpy(),
                                      tuple(merged_ids.numpy())))

        request.model_version = self._model_version
        response = self._stub.ReportGradient(request)
        return response.accepted, response.model_version
예제 #5
0
 def makeGrad():
     """Build a ReportGradientRequest with gradients "x" and "y", version 1."""
     named_grads = {
         "x": np.array([0.1], dtype=np.float32),
         "y": np.array([0.03, 0.06], dtype=np.float32),
     }
     request = elasticdl_pb2.ReportGradientRequest()
     for name, values in named_grads.items():
         request.gradient[name].CopyFrom(ndarray_to_tensor(values))
     request.model_version = 1
     return request
예제 #6
0
 def report_evaluation_metrics(self, model_outputs, labels):
     """
     Report model outputs and labels to the ps through the
     `ReportEvaluationMetrics` rpc.

     A `model_outputs` that is not a dict is treated as a single output
     stored under `MetricsDictKey.MODEL_OUTPUT`.

     Returns (accepted, model_version) from the rpc response.
     """
     if isinstance(model_outputs, dict):
         outputs_by_name = model_outputs
     else:
         outputs_by_name = {MetricsDictKey.MODEL_OUTPUT: model_outputs}

     request = elasticdl_pb2.ReportEvaluationMetricsRequest()
     for output_name, output in outputs_by_name.items():
         serialized = ndarray_to_tensor(output.numpy())
         request.model_outputs[output_name].CopyFrom(serialized)
     request.labels.CopyFrom(ndarray_to_tensor(labels.numpy()))
     request.model_version = self._model_version

     response = self._stub.ReportEvaluationMetrics(request)
     return response.accepted, response.model_version
예제 #7
0
파일: worker.py 프로젝트: sorrycc/elasticdl
 def report_variable(self):
     """
     Report every trainable variable of the model to the ps through the
     `ReportVariable` rpc.
     """
     request = elasticdl_pb2.ReportVariableRequest()
     for var in self._model.trainable_variables:
         serialized = ndarray_to_tensor(var.numpy())
         request.variable[var.name].CopyFrom(serialized)
     self._stub.ReportVariable(request)
예제 #8
0
    def testEvaluationService(self):
        """
        Exercise `EvaluationService`: metric reports while no evaluation
        job exists, job creation without a checkpoint, and completion of
        the tasks of an evaluation job.
        """
        # All checkpoint files live in a temporary directory that is removed
        # when the `with` block exits.
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testEvaluationService")
            checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
            # NOTE(review): the two identical dicts look like training and
            # evaluation shards, the empty dict a third shard set — confirm
            # against the `_TaskDispatcher` signature.
            task_d = _TaskDispatcher(
                {"f1": (0, 10), "f2": (0, 10)},
                {"f1": (0, 10), "f2": (0, 10)},
                {},
                3,
                1,
            )

            # Evaluation metrics will not be accepted if no evaluation ongoing
            evaluation_service = EvaluationService(
                checkpoint_service, None, task_d, 10, 20, 0, False
            )
            evaluation_metrics = {
                "mse": ndarray_to_tensor(
                    np.array([100, 200], dtype=np.float32)
                )
            }
            self.assertFalse(
                evaluation_service.report_evaluation_metrics(
                    1, evaluation_metrics
                )
            )

            # No checkpoint available
            self.assertFalse(evaluation_service.try_to_create_new_job())

            # The servicer is built with both services and given one model
            # variable "x".
            master = MasterServicer(
                2,
                2,
                None,
                task_d,
                init_var=[],
                checkpoint_filename_for_init="",
                checkpoint_service=checkpoint_service,
                evaluation_service=evaluation_service,
            )
            master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

            # Add an evaluation task and we can start evaluation
            # The todo list is expected to grow from 8 to 16 entries.
            self.assertEqual(8, len(task_d._todo))
            evaluation_service.add_evaluation_task(0)
            self.assertEqual(16, len(task_d._todo))
            self.assertFalse(evaluation_service._eval_job.finished())

            # The job must stay unfinished until the 8th task is completed;
            # afterwards the service no longer holds a job.
            for i in range(8):
                self.assertFalse(evaluation_service._eval_job.finished())
                evaluation_service.complete_task()
            self.assertTrue(evaluation_service._eval_job is None)
            self.assertFalse(evaluation_service.try_to_create_new_job())
예제 #9
0
파일: worker.py 프로젝트: sorrycc/elasticdl
 def report_evaluation_metrics(self, evaluation_metrics):
     """
     Report the metrics in `evaluation_metrics` to the ps through the
     `ReportEvaluationMetrics` rpc.

     Returns (accepted, model_version) from the rpc response.
     """
     request = elasticdl_pb2.ReportEvaluationMetricsRequest()
     for metric_name, metric in evaluation_metrics.items():
         values = metric.numpy()
         # A scalar has an empty shape; send it as a 1-D array of size 1
         if len(values.shape) == 0:
             values = values.reshape(1)
         serialized = ndarray_to_tensor(values)
         request.evaluation_metrics[metric_name].CopyFrom(serialized)
     request.model_version = self._model_version

     response = self._stub.ReportEvaluationMetrics(request)
     return response.accepted, response.model_version
예제 #10
0
 def _get_model_no_lock(self):
     """
     Build an `elasticdl_pb2.Model` from `self._version` and the entries
     of `self._model`.

     NOTE(review): the name suggests the caller is responsible for any
     locking — confirm against the call sites.
     """
     snapshot = elasticdl_pb2.Model()
     snapshot.version = self._version
     for name, value in self._model.items():
         serialized = ndarray_to_tensor(value.numpy())
         snapshot.param[name].CopyFrom(serialized)
     return snapshot
예제 #11
0
    def testReportGradient(self):
        """
        Check how `MasterServicer.ReportGradient` handles future, old,
        unknown, incompatible and current-version gradient reports, and
        that accepted gradients are accumulated and then applied.
        """
        def makeGrad():
            """ Make a ReportGradientRequest compatible with model"""
            req = elasticdl_pb2.ReportGradientRequest()
            req.gradient["x"].CopyFrom(
                ndarray_to_tensor(np.array([0.1], dtype=np.float32))
            )
            req.gradient["y"].CopyFrom(
                ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
            )
            req.model_version = 1
            return req

        # NOTE(review): the leading 3 appears to be the number of accepted
        # reports needed before a new model version is created (the third
        # accepted report below triggers the update) — confirm against
        # the `MasterServicer` signature.
        master = MasterServicer(
            3,
            3,
            tf.optimizers.SGD(0.1),
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        # Model at version 1 with variables "x" (1 element) and "y"
        # (2 elements), matching what makeGrad() reports.
        master._version = 1
        master.set_model_var("x", np.array([2.0], dtype=np.float32))
        master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

        # Report a future version, should raise exception
        req = makeGrad()
        req.model_version = 2
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an old version, should not be accepted
        req = makeGrad()
        req.model_version = 0
        res = master.ReportGradient(req, None)
        self.assertFalse(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report a unknown gradient, should raise.
        req = makeGrad()
        req.gradient["z"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an incompatible gradient, should raise.
        # ("y" has 2 elements in the model, the gradient sent here has 1.)
        req = makeGrad()
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report a current version, should be accepted.
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report a current version with part of gradients, should be accepted.
        req = makeGrad()
        del req.gradient["y"]
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)
        # Gradient should be accumulated.
        # "x" was reported twice (0.1 + 0.1), "y" only once.
        np.testing.assert_array_equal(
            np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
        )
        np.testing.assert_array_equal(
            np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
        )
        self.assertEqual(2, master._grad_n)

        # Report a current version, should be accepted, and a new version
        # created
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(2, res.model_version)
        # The accumulators are expected to be cleared after the update.
        self.assertFalse(master._gradient_sum)
        self.assertEqual(0, master._grad_n)
        np.testing.assert_array_equal(
            # [2] - 0.1 * [0.1]
            np.array([1.99], dtype=np.float32),
            master._model["x"].numpy(),
        )
        np.testing.assert_array_equal(
            # [12, 13] - 0.1 * [0.02, 0.04]
            np.array([11.998, 12.996], dtype=np.float32),
            master._model["y"].numpy(),
        )
예제 #12
0
 def verify(a):
     """Assert that `a` is unchanged by an ndarray -> tensor -> ndarray trip."""
     encoded = ndarray_to_tensor(a)
     decoded = tensor_to_ndarray(encoded)
     np.testing.assert_array_equal(a, decoded)
예제 #13
0
    def testEvaluationJob(self):
        """
        Drive an `_EvaluationJob` built with `_eval_metrics_fn()` through
        task completion and two accepted reports of model outputs and
        labels, then check the summarized metrics.
        """
        model_version = 1
        total_tasks = 5
        latest_chkp_version = 2
        job = _EvaluationJob(_eval_metrics_fn(), model_version, total_tasks)
        # A fresh job has no completed task and is not finished.
        self.assertEqual(0, job._completed_tasks)
        self.assertFalse(job.finished())
        self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

        # Now make 4 tasks finished
        for i in range(4):
            job.complete_task()
        self.assertEqual(4, job._completed_tasks)
        self.assertFalse(job.finished())
        self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))

        # One more task finishes
        job.complete_task()
        self.assertEqual(5, job._completed_tasks)
        self.assertTrue(job.finished())
        self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

        # No new model checkpoint
        latest_chkp_version = job.model_version
        self.assertFalse(self.ok_to_new_job(job, latest_chkp_version))
        latest_chkp_version = job.model_version + 1
        self.assertTrue(self.ok_to_new_job(job, latest_chkp_version))

        # Start to report metrics
        # A report for a version other than the job's is expected to be
        # refused.
        evaluation_version = job.model_version + 1
        model_outputs = {
            MetricsDictKey.MODEL_OUTPUT:
            ndarray_to_tensor(np.array([[1], [6], [3]], dtype=np.float32))
        }
        labels = ndarray_to_tensor(np.array([[1], [0], [3]], dtype=np.float32))
        self.assertFalse(
            job.report_evaluation_metrics(evaluation_version, model_outputs,
                                          labels))
        # The same report with the job's own version is accepted.
        evaluation_version = job.model_version
        self.assertTrue(
            job.report_evaluation_metrics(evaluation_version, model_outputs,
                                          labels))
        # One more
        self.assertTrue(
            job.report_evaluation_metrics(
                evaluation_version,
                {
                    MetricsDictKey.MODEL_OUTPUT:
                    ndarray_to_tensor(
                        np.array([[4], [5], [6], [7], [8]], dtype=np.float32))
                },
                ndarray_to_tensor(
                    np.array([[7], [8], [9], [10], [11]], dtype=np.float32)),
            ))
        # NOTE(review): presumably 2 of the 8 reported outputs equal their
        # label (0.25) and the squared errors sum to 36 + 45 = 81 over 8
        # samples (10.125) — depends on `_eval_metrics_fn`, which is not
        # shown here.
        expected_acc = 0.25
        evaluation_metrics = job.get_evaluation_summary()
        self.assertAlmostEqual(expected_acc,
                               evaluation_metrics.get("acc").numpy())
        self.assertAlmostEqual(expected_acc,
                               evaluation_metrics.get("acc_fn").numpy())
        self.assertAlmostEqual(10.125, evaluation_metrics.get("mse").numpy())