Example #1
    def save_latest_checkpoint(self, output_path):
        if self._checkpoint_service is None:
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version())
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)
Example #2
    def testInitFromCheckpoint(self):
        init_var = m["custom_model"]().trainable_variables
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testInitFromCheckpoint")
            os.makedirs(chkp_dir)
            master = MasterServicer(
                2,
                3,
                None,
                None,
                init_var=init_var,
                checkpoint_filename_for_init="",
                checkpoint_service=CheckpointService(chkp_dir, 2, 3, False),
                evaluation_service=None,
            )
            req = elasticdl_pb2.GetModelRequest()
            req.method = elasticdl_pb2.MINIMUM
            req.version = 0
            model = master.GetModel(req, None)
            master._checkpoint_service.save(master._version, model, False)

            chkp_file = master._checkpoint_service.get_checkpoint_path(
                master._version
            )
            # Create variables from init_var, get init value from checkpoint.
            master2 = MasterServicer(
                2,
                3,
                None,
                None,
                init_var=init_var,
                checkpoint_filename_for_init=chkp_file,
                checkpoint_service=CheckpointService("", 0, 0, False),
                evaluation_service=None,
            )
            model2 = master2.GetModel(req, None)
            self.assertEqual(model, model2)
            # Create variables from checkpoint.
            master3 = MasterServicer(
                2,
                3,
                None,
                None,
                init_var=[],
                checkpoint_filename_for_init=chkp_file,
                checkpoint_service=CheckpointService("", 0, 0, False),
                evaluation_service=None,
            )
            model3 = master3.GetModel(req, None)
            self.assertEqual(model, model3)
Example #3
    def testNeedToCheckpoint(self):
        checkpointer = CheckpointService("", 0, 5, False)
        self.assertFalse(checkpointer.is_enabled())
        checkpointer._steps = 3
        self.assertTrue(checkpointer.is_enabled())

        self.assertFalse(checkpointer.need_to_checkpoint(1))
        self.assertFalse(checkpointer.need_to_checkpoint(2))
        self.assertTrue(checkpointer.need_to_checkpoint(3))
        self.assertFalse(checkpointer.need_to_checkpoint(4))
        self.assertFalse(checkpointer.need_to_checkpoint(5))
        self.assertTrue(checkpointer.need_to_checkpoint(6))
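
The assertions above pin down the expected semantics: a CheckpointService constructed with checkpoint_steps=0 is disabled, and an enabled service checkpoints exactly when the model version is a multiple of `_steps`. A minimal sketch consistent with that behavior (an assumption, not the verified implementation):

class CheckpointServiceSketch:
    """Hypothetical sketch; only the behavior asserted above is guaranteed."""

    def __init__(self, checkpoint_steps):
        self._steps = checkpoint_steps

    def is_enabled(self):
        # Disabled when checkpoint_steps is 0 or unset.
        return bool(self._steps)

    def need_to_checkpoint(self, version):
        # Checkpoint exactly when the version is a multiple of _steps.
        return self.is_enabled() and version % self._steps == 0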
Example #4
    def testUserDefinedModel(self):
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0

        model_inst = SimpleModel()
        model_inst.build(SimpleModel.input_shapes())
        for variable in model_inst.trainable_variables:
            master.set_model_var(variable.name, variable.numpy())
        # Get version 0
        model = master.GetModel(req, None)
        self.assertEqual(0, model.version)
        self.assertEqual(
            [
                "dense_1/bias:0",
                "dense_1/kernel:0",
                "dense_2/bias:0",
                "dense_2/kernel:0",
            ],
            list(sorted(model.param.keys())),
        )
Example #5
    def testEvaluationService(self):
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testEvaluationService")
            checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
            task_d = _TaskDispatcher(
                {"f1": (0, 10), "f2": (0, 10)},
                {"f1": (0, 10), "f2": (0, 10)},
                {},
                3,
                1,
            )

            # Evaluation metrics are not accepted when no evaluation is ongoing
            evaluation_service = EvaluationService(
                checkpoint_service, None, task_d, 10, 20, 0, False
            )
            evaluation_metrics = {
                "mse": ndarray_to_tensor(
                    np.array([100, 200], dtype=np.float32)
                )
            }
            self.assertFalse(
                evaluation_service.report_evaluation_metrics(
                    1, evaluation_metrics
                )
            )

            # No checkpoint available
            self.assertFalse(evaluation_service.try_to_create_new_job())

            master = MasterServicer(
                2,
                2,
                None,
                task_d,
                init_var=[],
                checkpoint_filename_for_init="",
                checkpoint_service=checkpoint_service,
                evaluation_service=evaluation_service,
            )
            master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

            # Add an evaluation task so that evaluation can start
            self.assertEqual(8, len(task_d._todo))
            evaluation_service.add_evaluation_task(0)
            self.assertEqual(16, len(task_d._todo))
            self.assertFalse(evaluation_service._eval_job.finished())

            for i in range(8):
                self.assertFalse(evaluation_service._eval_job.finished())
                evaluation_service.complete_task()
            self.assertTrue(evaluation_service._eval_job is None)
            self.assertFalse(evaluation_service.try_to_create_new_job())
Example #6
    def testSaveLoadCheckpoint(self):
        init_var = m["custom_model"]().trainable_variables
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testSaveLoadCheckpoint")
            os.makedirs(chkp_dir)
            checkpointer = CheckpointService(chkp_dir, 3, 5, False)
            self.assertTrue(checkpointer.is_enabled())

            master = MasterServicer(
                2,
                3,
                None,
                None,
                init_var=init_var,
                checkpoint_filename_for_init="",
                checkpoint_service=checkpointer,
                evaluation_service=None,
            )

            req = elasticdl_pb2.GetModelRequest()
            req.method = elasticdl_pb2.MINIMUM
            req.version = 0
            model = master.GetModel(req, None)
            checkpointer.save(0, model, False)
            loaded_model = checkpointer.get_checkpoint_model(0)
            self.assertEqual(model.version, loaded_model.version)
            for var, loaded_var in zip(model.param, loaded_model.param):
                self.assertEqual(var, loaded_var)
Example #7
    def testReportTaskResult(self):
        task_d = _TaskDispatcher(
            {
                "shard_1": (0, 10),
                "shard_2": (0, 9)
            },
            {},
            {},
            records_per_task=3,
            num_epochs=2,
        )
        master = MasterServicer(
            3,
            3,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )

        # Map from task key to the number of runs.
        tasks = defaultdict(int)
        while True:
            req = elasticdl_pb2.GetTaskRequest()
            req.worker_id = random.randint(1, 10)
            task = master.GetTask(req, None)
            if not task.shard_name:
                break
            self.assertEqual(task_d._doing[task.task_id][0], req.worker_id)
            task_key = (task.shard_name, task.start, task.end)
            tasks[task_key] += 1
            report = elasticdl_pb2.ReportTaskResultRequest()
            report.task_id = task.task_id
            if task.start == 0 and tasks[task_key] == 1:
                # Simulate error reports.
                report.err_message = "Worker error"
            master.ReportTaskResult(report, None)

        self.assertDictEqual(
            {
                ("shard_1", 0, 3): 3,
                ("shard_1", 3, 6): 2,
                ("shard_1", 6, 9): 2,
                ("shard_1", 9, 10): 2,
                ("shard_2", 0, 3): 3,
                ("shard_2", 3, 6): 2,
                ("shard_2", 6, 9): 2,
            },
            tasks,
        )
Example #8
    def _create_checkpoint_service(self, args):
        checkpoint_service = None
        if (args.checkpoint_steps
                or self.job_type == JobType.TRAINING_WITH_EVALUATION):
            self.logger.info("Creating checkpoint service")
            checkpoint_service = CheckpointService(
                args.checkpoint_dir,
                args.checkpoint_steps,
                args.keep_checkpoint_max,
                self.job_type == JobType.TRAINING_WITH_EVALUATION,
            )

        return checkpoint_service
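
The branch above encodes a simple rule: checkpointing is enabled whenever checkpoint_steps is set, and is forced on for training-with-evaluation jobs, since evaluation fetches fixed model versions back from checkpoints. A standalone restatement of that rule, for illustration only:

def checkpointing_enabled(checkpoint_steps, job_type):
    # Mirrors the condition guarding CheckpointService creation above.
    return bool(checkpoint_steps) or job_type == JobType.TRAINING_WITH_EVALUATION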
Example #9
    def setUp(self):
        tf.keras.backend.clear_session()
        self.master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        self.master._version = 1
        self.model_handler = ModelHandler.get_model_handler(
            distribution_strategy="ParameterServerStrategy", stub=self.master)
Example #10
    def test_save_parameters_to_checkpoint_file(self):
        with tempfile.TemporaryDirectory() as tempdir:
            checkpoint_service = CheckpointService(
                checkpoint_dir=os.path.join(tempdir, "ckpt/"),
                checkpoint_steps=5,
                keep_checkpoint_max=3,
                include_evaluation=False,
            )
            pserver_servicer = PserverServicer(
                parameters=Parameters(),
                grads_to_wait=0,
                optimizer="optimizer",
                checkpoint_service=checkpoint_service,
                ps_id=0,
                num_ps_pods=1,
            )
            model_params = {
                "v0": tf.Variable([[1, 1, 1], [1, 1, 1]]),
                "v1": tf.Variable([[2, 2, 2], [2, 2, 2]]),
            }

            server_params = pserver_servicer._parameters
            for var_name, var_value in model_params.items():
                server_params.non_embedding_params[var_name] = var_value

            embedding_table = EmbeddingTable(name="embedding_0",
                                             dim=3,
                                             initializer="random_uniform")
            server_params.embedding_params["embedding_0"] = embedding_table
            server_params.set_embedding_param(
                name="embedding_0",
                indices=np.array([0, 1]),
                values=np.array([[1, 1, 1], [2, 2, 2]]),
            )

            for i in range(100):
                pserver_servicer._parameters.version += 1
                pserver_servicer._save_params_to_checkpoint_if_needed()

            self.assertEqual(len(os.listdir(checkpoint_service._directory)), 3)
            self.assertEqual(
                sorted(os.listdir(checkpoint_service._directory)),
                ["version-100", "version-90", "version-95"],
            )
            self.assertEqual(
                os.listdir(checkpoint_service._directory + "/version-100"),
                ["variables-0-of-1.chkpt"],
            )
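
The assertions above fix the retention contract: checkpoints are written every checkpoint_steps versions and only the newest keep_checkpoint_max of them survive, here "version-90", "version-95", and "version-100" after 100 version bumps. A hypothetical sketch of such a pruning policy (class and method names are illustrative):

import os
import shutil
from collections import deque

class CheckpointRetention:
    def __init__(self, directory, keep_checkpoint_max):
        self._directory = directory
        self._max = keep_checkpoint_max
        self._saved = deque()

    def on_checkpoint_saved(self, version):
        self._saved.append(version)
        while len(self._saved) > self._max:
            oldest = self._saved.popleft()
            # Drop the whole per-version checkpoint directory.
            shutil.rmtree(
                os.path.join(self._directory, "version-%d" % oldest))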
Example #11
    def _init_checkpoint_service(self, args):
        if all([
                args.checkpoint_dir,
                args.checkpoint_steps,
                args.keep_checkpoint_max,
        ]):
            self.checkpoint_service = CheckpointService(
                args.checkpoint_dir,
                args.checkpoint_steps,
                args.keep_checkpoint_max,
                include_evaluation=False,
            )
        else:
            self.checkpoint_service = None
            self.logger.warning(
                "Invalid checkpoint config and no model will be saved")
Example #12
    def testGetEmptyTask(self):
        master = MasterServicer(
            2,
            3,
            None,
            _TaskDispatcher({}, {}, {}, records_per_task=3, num_epochs=2),
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )

        req = elasticdl_pb2.GetTaskRequest()

        # No task yet, make sure the returned versions are as expected.
        req.worker_id = 1
        task = master.GetTask(req, None)
        self.assertEqual("", task.shard_file_name)
        self.assertEqual(0, task.model_version)

        master._version = 1
        task = master.GetTask(req, None)
        self.assertEqual("", task.shard_file_name)
        self.assertEqual(1, task.model_version)
Example #13
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation"""

    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        embedding_service_endpoint=None,
        embedding_dims={},
        lr_staleness_modulation=False,
        use_async=False,
    ):
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead of ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._embedding_service_endpoint = embedding_service_endpoint
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(
            optimizer, embedding_service_endpoint, embedding_dims, use_async
        )

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set model variable. Value should be a float32 ndarray"""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name)
        )

    def _modulate_lr_if_needed(self, opt):
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_dict(self, tensor_dict):
        assert tensor_dict
        for name, val in tensor_dict.items():
            self.set_model_var(name, tensor_to_ndarray(val))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_dict(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            logger.info(
                "Model is not intialized. It will be "
                "initialized by the first update from "
                "the worker."
            )

    def _init_optimizer(
        self, opt, embedding_service_endpoint, embedding_dims, use_async
    ):
        # A non-None `embedding_service_endpoint` means that ElasticDL
        # embedding layers are used.
        self._modulate_lr_if_needed(opt)
        if embedding_service_endpoint:
            return OptimizerWrapper(
                opt, embedding_service_endpoint, embedding_dims, use_async
            )
        return opt

    @staticmethod
    def var_name_encode(name):
        return name.replace(":", "-")

    def GetTask(self, request, _):
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        task_id, task = self._task_d.get(request.worker_id)
        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            # An evaluation task uses the model at its fixed version
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif not self._task_d.finished():
            # Not all tasks are finished, wait in case of new tasks later.
            res.type = elasticdl_pb2.WAIT
        return res

    def GetModel(self, request, _):
        if not self._use_async:
            self._validate_model_version(request.version)

        if (
            request.method == elasticdl_pb2.MINIMUM
            or request.version == self._version
        ):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version
            )
        except Exception:
            logger.error(
                "Failed to fetch checkpoint model for "
                "model version {}".format(request.version)
            )
        return pb_model

    def _update_model_version(self):
        assert self._lock.locked()
        self._version += 1

    def _update_edl_embedding_table(self, name_var_list):
        """
            Put updated embedding vectors' ids and values together
            and use EmbeddingService.update_embedding() to update
            embedding table in the distributed storage
        """
        keys = []
        embeddings = []
        for layer_name, unique_ids, embedding_var in name_var_list:
            keys.extend(
                [
                    Embedding.get_key([layer_name, i])
                    for i in unique_ids.numpy()
                ]
            )
            embeddings.extend([i for i in embedding_var.numpy()])

        if embeddings:
            EmbeddingService.update_embedding(
                keys=keys,
                embedding_vectors=embeddings,
                embedding_service_endpoint=self._embedding_service_endpoint,
            )

    def _update_model(self):
        grad_var = []

        # (grad, var) pairs excluding keras Embedding layer and
        # ElasticDL Embedding layer
        for k in self._gradient_sum:
            if not self._use_async:
                self._gradient_sum[k] = (
                    self._gradient_sum[k] / self._grad_to_wait
                )
            grad_var.append((self._gradient_sum[k], self._model[k]))

        # (grad, var) pair of Keras Embedding layer
        for k in self._gradient_sum_indexed:
            grad_var.append((self._gradient_sum_indexed[k], self._model[k]))

        # (grad, var) pair of ElasticDL Embedding layer
        if self._edl_embedding_gradients:
            for layer_name, grads in self._edl_embedding_gradients.items():
                grad_var.append((grads, layer_name))

        self._opt.apply_gradients(grad_var)

        # need the lock for model version update in async SGD
        if self._use_async:
            self._lock.acquire()
        self._update_model_version()
        self._update_evaluation()
        self._update_checkpoint()
        if self._use_async:
            self._lock.release()
        else:
            self._gradient_sum.clear()
            self._gradient_sum_indexed.clear()
            self._edl_embedding_gradients.clear()
            self._grad_n = 0

    def get_model_version(self):
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        try:
            logger.info(
                "Saving checkpoint for model version %d" % self._version
            )
            if locking:
                self._lock.acquire()
            pb_model = self._get_model_no_lock()
            self._checkpoint_service.save(
                self._version, pb_model, is_eval_checkpoint
            )
            checkpoint_version = self._version
            if locking:
                self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d"
                % self._version
            )

    def save_latest_checkpoint(self, output_path):
        if self._checkpoint_service is None:
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version()
        )
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False
            )

    def _update_checkpoint(self):
        if (
            self._checkpoint_service
            and self._checkpoint_service.need_to_checkpoint(self._version)
        ):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            pb_model.param[k].CopyFrom(ndarray_to_tensor(v.numpy()))
        return pb_model

    def _validate_model_version(self, request_model_version):
        if request_model_version > self._version:
            err_msg = (
                "Model version %d not available yet, "
                "current version: %d" % (request_model_version, self._version)
            )
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_dict(request.variable)
        return empty_pb2.Empty()

    def ReportGradient(self, request, _):
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version
        )

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        tmp = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do sanity check before accumulating gradients.
        for k, v in request.gradient.items():
            if k not in self._model:
                if v.indices:
                    # grads of ElasticDL Embedding layer
                    # TODO: check arr.shape[1] = embedding_dim of this
                    # EdlEmbedding layer
                    arr = tensor_to_ndarray(v)
                    edl_embedding_gradients[k] = arr
                    continue
                else:
                    raise ValueError(
                        "Gradient key: %s is not part of model" % k
                    )

            arr = tensor_to_ndarray(v)
            if isinstance(arr, tf.IndexedSlices):
                if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d"
                        % (
                            k,
                            arr.values.shape[1],
                            self._model[k].numpy().shape[1],
                        )
                    )

                max_index = tf.math.reduce_max(arr.indices).numpy()
                if max_index >= self._model[k].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d"
                        % (k, max_index, self._model[k].numpy().shape[0] - 1)
                    )
                indexed_grads[k] = arr
            else:
                if arr.shape != self._model[k].numpy().shape:
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension" % k
                    )
                tmp[k] = arr

        if not self._use_async:
            self._lock.acquire()
        self._process_gradients(
            edl_embedding_gradients, indexed_grads, tmp, request.model_version
        )
        if not self._use_async:
            self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res

    def _process_gradients(
        self, edl_embedding_gradients, indexed_grads, grads, request_version
    ):
        if not self._use_async:
            # grads of ElasticDL Embedding layer
            for k, v in edl_embedding_gradients.items():
                if k in self._edl_embedding_gradients:
                    self._edl_embedding_gradients[k] = merge_indexed_slices(
                        self._edl_embedding_gradients[k], v
                    )
                else:
                    self._edl_embedding_gradients[k] = v

            # grads of Keras Embedding layer
            for k, v in indexed_grads.items():
                if k not in self._gradient_sum_indexed:
                    self._gradient_sum_indexed[k] = v
                else:
                    grads_s = self._gradient_sum_indexed[k]
                    self._gradient_sum_indexed[k] = merge_indexed_slices(
                        grads_s, v
                    )

            # other grads
            for k, v in grads.items():
                if k in self._gradient_sum:
                    self._gradient_sum[k] = self._gradient_sum[k] + v
                else:
                    self._gradient_sum[k] = v
            self._grad_n += 1
        else:
            # TODO: do not accumulate gradients but apply directly.
            pass

        # staleness-aware learning rate modulation
        if self._lr_modulation:
            staleness = max(1, self._version - request_version)
            self._lr_modulation.set_multiplier(1.0 / staleness)
        if self._use_async or self._grad_n >= self._grad_to_wait:
            self._update_model()

    def ReportTaskResult(self, request, _):
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request.task_id, False)
        else:
            self._task_d.report(request.task_id, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.evaluation_metrics
        )
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res
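
For orientation, a minimal client-side sketch of reaching this servicer over gRPC. The MasterStub name follows the standard gRPC codegen convention for the Master service; the import path and address are assumptions:

import grpc

# Assumed import path for the generated proto modules.
from elasticdl.proto import elasticdl_pb2, elasticdl_pb2_grpc

channel = grpc.insecure_channel("localhost:50001")  # illustrative address
stub = elasticdl_pb2_grpc.MasterStub(channel)

# Request the latest model at or above version 0.
req = elasticdl_pb2.GetModelRequest()
req.method = elasticdl_pb2.MINIMUM
req.version = 0
model = stub.GetModel(req)
print("Got model version", model.version)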
Example #14
    def testGetModel(self):
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
        # Now master model is version 0
        self.assertEqual(0, master._version)

        # Get version 0 with minimum method
        req = elasticdl_pb2.GetModelRequest()
        req.version = 0
        req.method = elasticdl_pb2.MINIMUM
        model = master.GetModel(req, None)
        self.assertEqual(0, model.version)
        self.assertEqual(["x"], list(model.param.keys()))
        np.testing.assert_array_equal(
            np.array([1.0, 1.0]), tensor_to_ndarray(model.param["x"])
        )

        # Increase the master model version to 1; requesting version 0
        # with the MINIMUM method should still return version 1
        master._version = 1
        master.set_model_var("x", np.array([2.0, 2.0], dtype=np.float32))
        master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )

        # Try to get version 2, it should raise exception.
        req.version = 2
        self.assertRaises(ValueError, master.GetModel, req, None)

        # Get fixed version 1
        req.method = elasticdl_pb2.FIXED
        req.version = 1
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )

        # Previous model unavailable due to no checkpoint
        req.version = 0
        model = master.GetModel(req, None)
        self.assertFalse(model.param)

        # Previous model available through checkpoint
        with tempfile.TemporaryDirectory() as tempdir:
            chk_dir = os.path.join(tempdir, "testGetModel")
            os.makedirs(chk_dir)
            req.version = master._version
            req.method = elasticdl_pb2.MINIMUM
            model = master.GetModel(req, None)
            master._checkpoint_service = CheckpointService(
                chk_dir, 2, 5, False
            )
            master._checkpoint_service.save(master._version, model, False)
            master._version = 2
            master.set_model_var("z", np.array([2.0, 2.0], dtype=np.float32))
            req.version = 1
            req.method = elasticdl_pb2.FIXED
            model = master.GetModel(req, None)
            self.assertEqual(1, model.version)
            self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
            np.testing.assert_array_equal(
                np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
            )
            np.testing.assert_array_equal(
                np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
            )
Example #15
    def testReportGradient(self):
        def makeGrad():
            """ Make a ReportGradientRequest compatible with model"""
            req = elasticdl_pb2.ReportGradientRequest()
            req.gradient["x"].CopyFrom(
                ndarray_to_tensor(np.array([0.1], dtype=np.float32))
            )
            req.gradient["y"].CopyFrom(
                ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
            )
            req.model_version = 1
            return req

        master = MasterServicer(
            3,
            3,
            tf.optimizers.SGD(0.1),
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        master._version = 1
        master.set_model_var("x", np.array([2.0], dtype=np.float32))
        master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

        # Report a future version, should raise exception
        req = makeGrad()
        req.model_version = 2
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an old version, should not be accepted
        req = makeGrad()
        req.model_version = 0
        res = master.ReportGradient(req, None)
        self.assertFalse(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report an unknown gradient, should raise.
        req = makeGrad()
        req.gradient["z"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report an incompatible gradient, should raise.
        req = makeGrad()
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        self.assertRaises(ValueError, master.ReportGradient, req, None)

        # Report a current version, should be accepted.
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)

        # Report a current version with part of gradients, should be accepted.
        req = makeGrad()
        del req.gradient["y"]
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(1, res.model_version)
        # Gradient should be accumulated.
        np.testing.assert_array_equal(
            np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
        )
        np.testing.assert_array_equal(
            np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
        )
        self.assertEqual(2, master._grad_n)

        # Report a current version, should be accepted, and a new version
        # created
        req = makeGrad()
        res = master.ReportGradient(req, None)
        self.assertTrue(res.accepted)
        self.assertEqual(2, res.model_version)
        self.assertFalse(master._gradient_sum)
        self.assertEqual(0, master._grad_n)
        np.testing.assert_array_equal(
            # [2] - 0.1 * [0.1]
            np.array([1.99], dtype=np.float32),
            master._model["x"].numpy(),
        )
        np.testing.assert_array_equal(
            # [12, 13] - 0.1 * [0.02, 0.04]
            np.array([11.998, 12.996], dtype=np.float32),
            master._model["y"].numpy(),
        )
Example #16
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model zoo.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
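
A hypothetical invocation of the helper above; the model zoo path and feature shape are placeholders, and model_def reuses the example value from the docstring:

version = distributed_train_and_evaluate(
    feature_shape=[32, 32, 3],            # placeholder input shape
    model_zoo_path="/path/to/model_zoo",  # placeholder model zoo directory
    model_def="cifar10_subclass.CustomModel",
    training=True,
)
print("Final model version:", version)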
Example #17
def main():
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)).__dict__
    model_inst = load_model_from_module(args.model_def, model_module,
                                        args.model_params)
    optimizer = model_module[args.optimizer]()

    if all((
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
    )):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all((
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
    )):
        job_type = JobType.EVALUATION_ONLY
    elif all((
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
    )):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (job_type == JobType.TRAINING_WITH_EVALUATION
            or job_type == JobType.EVALUATION_ONLY):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info("Embedding service start succeeded. The endpoint is %s." %
                    str(embedding_service_endpoint))
        embedding_dims = dict([(layer.name, layer.output_dim)
                               for layer in layers])

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running...")
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning("Unable to keep TensorBoard running. "
                               "It has already terminated")
                break
    logger.info("Master stopped")
Example #18
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation"""
    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        lr_staleness_modulation=False,
        use_async=False,
    ):
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead of ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(optimizer, use_async)

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set model variable. Value should be a float32 ndarray"""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name))

    def _modulate_lr_if_needed(self, opt):
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_pb_list(self, tensor_pb_list):
        assert tensor_pb_list
        for pb in tensor_pb_list:
            self.set_model_var(pb.name, tensor_pb_to_ndarray(pb))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_pb_list(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            logger.info("Model is not intialized. It will be "
                        "initialized by the first update from "
                        "the worker.")

    # TODO: remove this function
    def _init_optimizer(self, opt, use_async):
        self._modulate_lr_if_needed(opt)
        if opt:
            return OptimizerWrapper(opt, use_async)
        return opt

    @staticmethod
    def var_name_encode(name):
        return name.replace(":", "-")

    def GetTask(self, request, _):
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        if request.task_type == elasticdl_pb2.EVALUATION:
            task_id, task = self._task_d.get_eval_task(request.worker_id)
        else:
            task_id, task = self._task_d.get(request.worker_id)

        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            for k, v in task.extended_config.items():
                res.extended_config[k] = v

            # An evaluation task uses the model at its fixed version
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif (not self._task_d.finished()) or (
                self._task_d.invoke_deferred_callback()):
            # Either not all tasks are finished, or there is still a
            # deferred callback to pop and invoke. In both cases the
            # master tells the worker to wait, since new tasks may
            # arrive later.
            res.type = elasticdl_pb2.WAIT

        return res

    def GetModel(self, request, _):
        if not self._use_async:
            self._validate_model_version(request.version)

        if (request.method == elasticdl_pb2.MINIMUM
                or request.version == self._version):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version)
        except Exception:
            logger.error("Failed to fetch checkpoint model for "
                         "model version {}".format(request.version))
        return pb_model

    def get_model_version(self):
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        try:
            logger.info("Saving checkpoint for model version %d" %
                        self._version)
            if locking:
                self._lock.acquire()
            pb_model = self._get_model_no_lock()
            self._checkpoint_service.save(self._version, pb_model,
                                          is_eval_checkpoint)
            checkpoint_version = self._version
            if locking:
                self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d" %
                self._version)

    def save_latest_checkpoint(self, output_path):
        if self._checkpoint_service is None:
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version())
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=self._version)

    def _update_checkpoint(self):
        if (self._checkpoint_service and
                self._checkpoint_service.need_to_checkpoint(self._version)):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            emplace_tensor_pb_from_ndarray(pb_model.param, v.numpy(), name=k)
        return pb_model

    def _validate_model_version(self, request_model_version):
        if request_model_version > self._version:
            err_msg = ("Model version %d not available yet, "
                       "current version: %d" %
                       (request_model_version, self._version))
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_pb_list(request.variable)
        return empty_pb2.Empty()

    def ReportTaskResult(self, request, _):
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request, False)
        else:
            self._task_d.report(request, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.model_outputs, request.labels)
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res

    def ReportVersion(self, request, _):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=request.model_version)
        return empty_pb2.Empty()
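
The pair _get_model_no_lock / _init_model_from_tensor_pb_list above implies a round trip between ndarrays and tensor protobufs. A sketch using only the call shapes visible in this example; the import paths are assumptions:

import numpy as np

# Assumed import paths for the proto module and tensor helpers.
from elasticdl.proto import elasticdl_pb2
from elasticdl.python.common.tensor import (
    emplace_tensor_pb_from_ndarray,
    tensor_pb_to_ndarray,
)

pb_model = elasticdl_pb2.Model()
arr = np.array([1.0, 2.0], dtype=np.float32)
# Serialize the array into the repeated tensor field under the name "x".
emplace_tensor_pb_from_ndarray(pb_model.param, arr, name="x")
# ...and read it back.
for pb in pb_model.param:
    np.testing.assert_array_equal(arr, tensor_pb_to_ndarray(pb))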
Example #19
    def testMaxCheckpointVersions(self):
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
            os.makedirs(chkp_dir)
            # Save checkpoints every 2 steps, and keep 5 checkpoints at most
            checkpointer = CheckpointService(chkp_dir, 2, 5, False)
            self.assertTrue(checkpointer.is_enabled())

            batch_size = 2
            # Launch the training
            arguments = [
                "--worker_id",
                1,
                "--job_type",
                JobType.TRAINING_ONLY,
                "--minibatch_size",
                batch_size,
                "--model_zoo",
                _model_zoo_path,
                "--model_def",
                "test_module.custom_model",
            ]
            args = parse_worker_args(arguments)
            worker = Worker(args)

            filename = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
            task_d = _TaskDispatcher({filename: (0, 128)}, {}, {},
                                     records_per_task=64,
                                     num_epochs=1)
            master = MasterServicer(
                2,
                batch_size,
                worker._opt_fn(),
                task_d,
                init_var=worker._model.trainable_variables,
                checkpoint_filename_for_init="",
                checkpoint_service=checkpointer,
                evaluation_service=None,
            )

            worker._stub = InProcessMaster(master)
            worker.run()

            # We should have 5 checkpoints when the training finishes
            checkpoint_files = sorted(os.listdir(checkpointer._directory))
            self.assertEqual(
                checkpoint_files,
                [
                    "model_v24.chkpt",
                    "model_v26.chkpt",
                    "model_v28.chkpt",
                    "model_v30.chkpt",
                    "model_v32.chkpt",
                ],
            )
            # Latest version should be 32
            self.assertEqual(32, checkpointer.get_latest_checkpoint_version())
            # Check all checkpoints
            for version in [24, 26, 28, 30, 32]:
                model = checkpointer.get_checkpoint_model(version)
                self.assertEqual(version, model.version)
            # Checkpoint not found
            self.assertRaisesRegex(
                RuntimeError,
                "Failed to read model checkpoint from file",
                checkpointer.get_checkpoint_model,
                100,
            )
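
The file names asserted above pin down this version's path scheme: one flat model_v<version>.chkpt file per checkpoint (the directory-per-version layout in Example #10 belongs to the later, parameter-server-based variant). A sketch of get_checkpoint_path consistent with these assertions, offered as an assumption:

import os

def get_checkpoint_path(directory, version):
    # Matches the "model_v24.chkpt" ... "model_v32.chkpt" names above.
    return os.path.join(directory, "model_v%d.chkpt" % version)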
Example #20
    def distributed_train_and_evaluate(
        self,
        feature_shape,
        model_def,
        model_params="",
        training=True,
        dataset="",
    ):
        """
        Run distributed training and evaluation with a local master.
        gRPC calls are mocked by direct local master calls.
        """
        job_type = (JobType.TRAINING_ONLY
                    if training else JobType.EVALUATION_ONLY)
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def=model_def,
            model_params=model_params,
            channel=None,
        )

        if dataset == "imagenet":
            batch_size = 8
            shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
        elif dataset == "frappe":
            shards = {
                create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
            }
        else:
            shards = {create_recordio_file(128, feature_shape): (0, 128)}

        if training:
            training_shards = shards
            evaluation_shards = shards
        else:
            training_shards = {}
            evaluation_shards = shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )
        # Initialize checkpoint service
        checkpoint_service = CheckpointService("", 0, 0, True)
        if training:
            evaluation_service = EvaluationService(checkpoint_service, None,
                                                   task_d, 0, 0, 1, False)
        else:
            evaluation_service = EvaluationService(checkpoint_service, None,
                                                   task_d, 0, 0, 0, True)
        task_d.set_evaluation_service(evaluation_service)
        # The master service
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        worker._stub = InProcessMaster(master)

        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        # No more task.
        self.assertTrue(not task.shard_name)