def testEvaluationOnly(self):
        """Check that the evaluation job finishes after 8 completed tasks."""
        expected_task_count = 8
        eval_shards = {"f1": (0, 10), "f2": (0, 10)}
        dispatcher = _TaskDispatcher({}, eval_shards, {}, 3, 1)

        eval_service = EvaluationService(
            None, dispatcher, 0, 0, 0, True, _eval_metrics_fn
        )
        dispatcher.set_evaluation_service(eval_service)

        # The servicer is built but never used directly by this test.
        _ = MasterServicer(
            2, dispatcher, evaluation_service=eval_service, master=None,
        )

        self.assertEqual(expected_task_count, len(dispatcher._eval_todo))
        remaining = expected_task_count
        while remaining > 0:
            # The job must stay unfinished until the last task completes.
            self.assertFalse(eval_service._eval_job.finished())
            eval_service.complete_task()
            remaining -= 1
        self.assertTrue(eval_service._eval_job.finished())
示例#2
0
    def testReportTaskResult(self):
        """Drain every task through the master and count how often each ran.

        The first time a task whose range starts at 0 is handed out, its
        result is reported with an error message; the final tally is then
        compared with the expected number of runs per task.
        """
        training_shards = {"shard_1": (0, 10), "shard_2": (0, 9)}
        dispatcher = _TaskDispatcher(
            training_shards, {}, {}, records_per_task=3, num_epochs=2
        )
        master = MasterServicer(
            3, dispatcher, evaluation_service=None, master=None
        )

        # Maps (shard_name, start, end) to the number of times it was issued.
        run_counts = defaultdict(int)
        while True:
            request = elasticdl_pb2.GetTaskRequest()
            request.worker_id = random.randint(1, 10)
            task = master.get_task(request, None)
            if not task.shard_name:
                # An empty shard name marks the end of the task stream.
                break
            self.assertEqual(
                dispatcher._doing[task.task_id][0], request.worker_id
            )

            key = (task.shard_name, task.start, task.end)
            run_counts[key] += 1

            result = elasticdl_pb2.ReportTaskResultRequest()
            result.task_id = task.task_id
            if task.start == 0 and run_counts[key] == 1:
                # Simulate error reports.
                result.err_message = "Worker error"
            master.report_task_result(result, None)

        expected_counts = {
            ("shard_1", 0, 3): 3,
            ("shard_1", 3, 6): 2,
            ("shard_1", 6, 9): 2,
            ("shard_1", 9, 10): 2,
            ("shard_2", 0, 3): 3,
            ("shard_2", 3, 6): 2,
            ("shard_2", 6, 9): 2,
        }
        self.assertDictEqual(expected_counts, run_counts)
    def testEvaluationService(self):
        """Walk an evaluation job from creation to completion.

        Checks that no job can be created up front, that adding an
        evaluation task fills the evaluation queue, and that the job is
        cleared after 8 task completions.
        """
        expected_task_count = 8
        training_shards = {"f1": (0, 10), "f2": (0, 10)}
        evaluation_shards = {"f1": (0, 10), "f2": (0, 10)}
        dispatcher = _TaskDispatcher(
            training_shards, evaluation_shards, {}, 3, 1
        )

        # Evaluation metrics will not be accepted if no evaluation ongoing
        service = EvaluationService(
            None, dispatcher, 10, 20, 0, False, _eval_metrics_fn,
        )

        # The servicer is built but never used directly by this test.
        _ = MasterServicer(2, dispatcher, evaluation_service=service)

        # No checkpoint available
        self.assertFalse(service.try_to_create_new_job())

        # Add an evaluation task and we can start evaluation
        self.assertEqual(expected_task_count, len(dispatcher._todo))
        service.add_evaluation_task(False)
        self.assertEqual(expected_task_count, len(dispatcher._eval_todo))
        self.assertFalse(service._eval_job.finished())

        remaining = expected_task_count
        while remaining > 0:
            self.assertFalse(service._eval_job.finished())
            service.complete_task()
            remaining -= 1
        self.assertTrue(service._eval_job is None)
        self.assertFalse(service.try_to_create_new_job())
    def testNeedEvaluation(self):
        """Check which model versions get queued for evaluation.

        Requests evaluation for a sequence of model versions and verifies
        the content of `_eval_checkpoint_versions` after every request.
        """
        training_shards = {"f1": (0, 10), "f2": (0, 10)}
        evaluation_shards = {"f1": (0, 10), "f2": (0, 10)}
        dispatcher = _TaskDispatcher(
            training_shards, evaluation_shards, {}, 3, 1,
        )

        service = EvaluationService(
            None, dispatcher, 10, 0, 10, False, _eval_metrics_fn,
        )

        def request_evaluation(version):
            service.add_evaluation_task_if_needed(
                master_locking=False, model_version=version
            )

        # Should add evaluation task and create eval job
        request_evaluation(10)
        self.assertTrue(service._eval_job is not None)
        self.assertEqual(service._eval_checkpoint_versions, [])

        # (requested version, expected pending versions after the request)
        follow_up_requests = [
            # Should ignore because version 10 is in the eval list
            (10, []),
            # Should append version 20 to the eval list
            (20, [20]),
            # Should ignore version 10 because version 20 is already in
            # eval list
            (10, [20]),
            # Should append version 30 to the eval list
            (30, [20, 30]),
        ]
        for version, pending_versions in follow_up_requests:
            request_evaluation(version)
            self.assertEqual(
                service._eval_checkpoint_versions, pending_versions
            )
示例#5
0
    def testGetEmptyTask(self):
        """Check the model version reported on tasks with no shard name."""
        master = MasterServicer(
            3,
            _TaskDispatcher({}, {}, {}, records_per_task=3, num_epochs=2),
            evaluation_service=None,
        )

        request = elasticdl_pb2.GetTaskRequest()
        request.worker_id = 1

        def assert_empty_task(expected_version):
            task = master.get_task(request, None)
            self.assertEqual("", task.shard_name)
            self.assertEqual(expected_version, task.model_version)

        # No task yet, make sure the returned versions are as expected.
        assert_empty_task(0)

        master._version = 1
        assert_empty_task(1)
示例#6
0
def _make_task_dispatcher(
    training_data_dir,
    evaluation_data_dir,
    prediction_data_dir,
    records_per_task,
    num_epochs,
):
    """Build a `_TaskDispatcher` from three RecordIO data directories.

    Args:
        training_data_dir: Directory passed to the reader for training shards.
        evaluation_data_dir: Directory passed to the reader for evaluation
            shards.
        prediction_data_dir: Directory passed to the reader for prediction
            shards.
        records_per_task: Forwarded unchanged to `_TaskDispatcher`.
        num_epochs: Epoch count forwarded to `_TaskDispatcher`; replaced by 1
            when the prediction shards are non-empty.

    Returns:
        The constructed `_TaskDispatcher`.
    """

    def _shards_in(data_dir):
        # TODO: Support any subclasses of `AbstractDataReader`
        # and support passing specified parameters to the constructor
        return RecordIODataReader(data_dir=data_dir).create_shards()

    prediction_shards = _shards_in(prediction_data_dir)
    training_shards = _shards_in(training_data_dir)
    evaluation_shards = _shards_in(evaluation_data_dir)

    # Only generate prediction tasks for 1 epoch
    if prediction_shards:
        epochs = 1
    else:
        epochs = num_epochs

    return _TaskDispatcher(
        training_shards,
        evaluation_shards,
        prediction_shards,
        records_per_task,
        epochs,
    )
示例#7
0
    def test_create_tasks_with_non_zero_start_ind(self):
        """Check the tasks created for a shard whose start index is not 0."""
        task_d = _TaskDispatcher({"f1": (0, 10), "f2": (10, 10)}, {}, {}, 3, 1)

        # Each shard is expected to be cut into these ranges, relative to the
        # shard's own start index.
        relative_ranges = [(0, 3), (3, 6), (6, 9), (9, 10)]
        expected_tasks = [
            (name, start + lo, start + hi, elasticdl_pb2.TRAINING, -1)
            for name, start in (("f1", 0), ("f2", 10))
            for lo, hi in relative_ranges
        ]

        # get all tasks out, each worker is assigned 2 tasks.
        got_tasks = []
        for worker_id in range(4):
            for _ in range(2):
                got_tasks.append(task_d.get(worker_id))

        # verify ids ranges from 1 to 8
        task_ids = [task_id for task_id, _ in got_tasks]
        self.assertEqual(list(range(1, 9)), task_ids)

        # verify tasks
        task_infos = sorted(task._info() for _, task in got_tasks)
        self.assertEqual(task_infos, expected_tasks)
示例#8
0
    def test_create_delete_worker_pod(self):
        """Start worker pods, process one more worker id, then remove all.

        Each phase polls the worker pod counters (up to 20 times, 3 seconds
        apart); only the final state, an empty counter, is asserted.
        """
        task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        task_d.recover_tasks = MagicMock()
        job_name = "test-create-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        instance_manager = InstanceManager(
            task_d,
            job_name=job_name,
            image_name="ubuntu:18.04",
            worker_command=["/bin/bash"],
            worker_args=["-c", "echo"],
            namespace="default",
            num_workers=2,
        )
        max_check_num = 20

        def poll_worker_counters(is_done):
            # Poll until `is_done(counters)` holds or the attempts run out,
            # then hand back the last counters that were read.
            for _ in range(max_check_num):
                time.sleep(3)
                counters = instance_manager.get_pod_counter(pod_type="worker")
                if is_done(counters):
                    break
            return counters

        instance_manager.start_workers()
        poll_worker_counters(lambda c: c["Succeeded"] == 2)

        # presumably schedules worker id 2 for creation; verify against
        # InstanceManager
        instance_manager._not_created_worker_id = [2]
        instance_manager._worker_pod_priority[2] = None
        instance_manager._process_worker()
        poll_worker_counters(lambda c: c["Succeeded"] == 3)

        instance_manager.stop_relaunch_and_remove_pods(pod_type="worker")
        final_counters = poll_worker_counters(lambda c: not c)
        self.assertFalse(final_counters)
    def test_get_worker_addrs(self):
        """Check that one address is reported per running worker.

        The counters are polled 20 times, 3 seconds apart; whenever at least
        one worker is running, the number of alive worker addresses must
        equal the running count.
        """
        task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
        job_name = "test-create-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        instance_manager = InstanceManager(
            task_d,
            job_name=job_name,
            image_name="ubuntu:18.04",
            worker_command=["/bin/bash"],
            worker_args=["-c", "sleep 5 #"],
            namespace="default",
            num_workers=3,
        )

        instance_manager.start_workers()
        max_check_num = 20
        attempts_left = max_check_num
        while attempts_left > 0:
            attempts_left -= 1
            time.sleep(3)
            counters = instance_manager.get_worker_counter()
            if not counters["Running"]:
                continue
            alive_addrs = instance_manager._get_alive_worker_addr()
            self.assertEqual(len(alive_addrs), counters["Running"])

        instance_manager.stop_relaunch_and_remove_workers()
示例#10
0
    def test_epoch(self):
        """Check that both epochs hand out the same set of training tasks."""
        task_d = _TaskDispatcher({"f1": (0, 10), "f2": (0, 10)}, {}, {}, 3, 2)

        record_ranges = [(0, 3), (3, 6), (6, 9), (9, 10)]
        epoch_tasks = [
            (shard, start, end, elasticdl_pb2.TRAINING, -1)
            for shard in ("f1", "f2")
            for start, end in record_ranges
        ]

        # The first pass fetches the first epoch's tasks, the second pass the
        # second epoch's; each of 4 worker ids asks for 2 tasks per pass.
        for _ in range(2):
            fetched = []
            for request_index in range(8):
                fetched.append(task_d.get(request_index // 2))
            fetched_infos = sorted(task._info() for _, task in fetched)
            self.assertEqual(fetched_infos, epoch_tasks)
示例#11
0
    def testFailedWorkerPod(self):
        """
        Start worker pods with a command destined to fail and
        restart_policy="Never", then check that `recover_tasks` was called
        for worker ids 0, 1 and 2.
        """
        task_d = _TaskDispatcher({"f": 10}, {}, {}, 1, 1)
        task_d.recover_tasks = MagicMock()
        job_name = "test-failed-worker-pod-%d-%d" % (
            int(time.time()),
            random.randint(1, 101),
        )
        worker_manager = WorkerManager(
            task_d,
            job_name=job_name,
            image_name="gcr.io/google-samples/hello-app:1.0",
            command=["badcommand"],
            args=["badargs"],
            namespace="default",
            num_workers=3,
            restart_policy="Never",
        )
        max_check_num = 20

        def poll_counters(is_done):
            # Print the counters on every poll and stop early once
            # `is_done(counters)` holds.
            for _ in range(max_check_num):
                time.sleep(3)
                counters = worker_manager.get_counters()
                print(counters)
                if is_done(counters):
                    break

        worker_manager.start_workers()
        poll_counters(lambda c: c["Failed"] == 3)

        worker_manager.stop_relaunch_and_remove_workers()
        poll_counters(lambda c: not c)

        expected_calls = [call(worker_id) for worker_id in (0, 1, 2)]
        task_d.recover_tasks.assert_has_calls(expected_calls, any_order=True)
示例#12
0
    def testEvaluationOnly(self):
        """Check that the evaluation job finishes after 8 completed tasks."""
        expected_task_count = 8
        eval_shards = {"f1": (0, 10), "f2": (0, 10)}
        dispatcher = _TaskDispatcher({}, eval_shards, {}, 3, 1)

        eval_service = EvaluationService(
            None, None, dispatcher, 0, 0, 0, True, _eval_metrics_fn
        )
        dispatcher.set_evaluation_service(eval_service)

        master = MasterServicer(
            2,
            2,
            None,
            dispatcher,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=eval_service,
        )
        initial_value = np.array([1.0, 1.0], dtype=np.float32)
        master.set_model_var("x", initial_value)

        self.assertEqual(expected_task_count, len(dispatcher._todo))
        remaining = expected_task_count
        while remaining > 0:
            # The job must stay unfinished until the last task completes.
            self.assertFalse(eval_service._eval_job.finished())
            eval_service.complete_task()
            remaining -= 1
        self.assertTrue(eval_service._eval_job.finished())
示例#13
0
    def testGetEmptyTask(self):
        """Check the model version reported on tasks with no shard file."""
        dispatcher = _TaskDispatcher(
            {}, {}, {}, records_per_task=3, num_epochs=2
        )
        checkpoint_service = CheckpointService("", 0, 0, False)
        master = MasterServicer(
            2,
            3,
            None,
            dispatcher,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=None,
        )

        request = elasticdl_pb2.GetTaskRequest()
        request.worker_id = 1

        def assert_empty_task(expected_version):
            task = master.GetTask(request, None)
            self.assertEqual("", task.shard_file_name)
            self.assertEqual(expected_version, task.model_version)

        # No task yet, make sure the returned versions are as expected.
        assert_empty_task(0)

        master._version = 1
        assert_empty_task(1)
示例#14
0
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=None,
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master. Grpc
    calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name under which the evaluation metrics function
            is looked up in the loaded model module.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callback classes; each one is
            instantiated as `callback_class(master, worker)`. `None` (the
            default) means no callbacks.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.

    Raises:
        RuntimeError: If the master still hands out a task after the worker
            has finished running.
    """
    # Use None instead of a shared mutable default list.
    if callback_classes is None:
        callback_classes = []

    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    # Evaluation always uses the generated shard; training uses it only for
    # a training job.
    training_shards = shards if training else {}
    evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    # The two job types differ only in the sixth and seventh positional
    # arguments: (1, False) for training, (0, True) for evaluation only.
    evaluation_service = EvaluationService(
        checkpoint_service,
        None,
        task_d,
        0,
        0,
        1 if training else 0,
        not training,
        model_module[eval_metrics_fn],
    )
    task_d.set_evaluation_service(evaluation_service)

    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    # Route the worker's master calls to the in-process servicer.
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
示例#15
0
    def testEvaluationService(self):
        """Walk an evaluation job from creation to completion.

        Checks that metrics are rejected and no job can be created before an
        evaluation task exists, that adding an evaluation task grows the todo
        list from 8 to 16, and that the job is cleared after 8 completions.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            checkpoint_dir = os.path.join(tempdir, "testEvaluationService")
            checkpoint_service = CheckpointService(checkpoint_dir, 5, 5, True)
            training_shards = {"f1": (0, 10), "f2": (0, 10)}
            evaluation_shards = {"f1": (0, 10), "f2": (0, 10)}
            dispatcher = _TaskDispatcher(
                training_shards, evaluation_shards, {}, 3, 1,
            )

            # Evaluation metrics will not be accepted if no evaluation ongoing
            service = EvaluationService(
                checkpoint_service,
                None,
                dispatcher,
                10,
                20,
                0,
                False,
                _eval_metrics_fn,
            )
            output_tensor = ndarray_to_tensor(
                np.array([1, 6, 3], dtype=np.float32)
            )
            model_outputs = {MetricsDictKey.MODEL_OUTPUT: output_tensor}
            labels = ndarray_to_tensor(np.array([1, 0, 3], dtype=np.float32))

            accepted = service.report_evaluation_metrics(
                1, model_outputs, labels
            )
            self.assertFalse(accepted)

            # No checkpoint available
            self.assertFalse(service.try_to_create_new_job())

            master = MasterServicer(
                2,
                2,
                None,
                dispatcher,
                init_var=[],
                checkpoint_filename_for_init="",
                checkpoint_service=checkpoint_service,
                evaluation_service=service,
            )
            master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

            # Add an evaluation task and we can start evaluation
            self.assertEqual(8, len(dispatcher._todo))
            service.add_evaluation_task(False)
            self.assertEqual(16, len(dispatcher._todo))
            self.assertFalse(service._eval_job.finished())

            remaining = 8
            while remaining > 0:
                self.assertFalse(service._eval_job.finished())
                service.complete_task()
                remaining -= 1
            self.assertTrue(service._eval_job is None)
            self.assertFalse(service.try_to_create_new_job())
示例#16
0
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation against a local master
    server that is started for the duration of the call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g.  "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAINING_WITH_EVALUATION`, False for
            job type `EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates. It is not
            read by this function body.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.

    Raises:
        RuntimeError: If the master still hands out a task after the worker
            has finished running.
    """
    job_type = (JobType.TRAINING_WITH_EVALUATION
                if training else JobType.EVALUATION_ONLY)
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(get_module_file_path(model_zoo_path,
                                                    model_def)).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    # Evaluation always uses the generated shard; training uses it only for
    # a training job.
    training_shards = shards if training else {}
    evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    # The two job types differ only in `evaluation_steps` and in the boolean
    # positional argument: False for training, True for evaluation only.
    evaluation_service = EvaluationService(
        None,
        task_d,
        0,
        0,
        evaluation_steps,
        not training,
        model_module[eval_metrics_fn],
    )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    try:
        mc = MasterClient(build_channel("localhost:%d" % port), 1)
        worker = Worker(
            args, master_client=mc, ps_client=PSClient(ps_channels)
        )

        for pservicer in pservers:
            # FIXME(yancey1989): decouple pserver and master client
            pservicer._master_stub = mc

        worker.run()

        task = mc.get_task()
    finally:
        # Stop the master servicer even when the worker or the client
        # raises, so a failing run does not leave the server running.
        svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return task.model_version
示例#17
0
    def testMaxCheckpointVersions(self):
        """Train with checkpointing on and verify which checkpoints remain.

        After the worker finishes, the checkpoint directory must contain
        exactly the files for versions 24, 26, 28, 30 and 32, each of which
        must load with a matching version; loading version 100 must fail.
        """
        kept_versions = [24, 26, 28, 30, 32]
        with tempfile.TemporaryDirectory() as tempdir:
            chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
            os.makedirs(chkp_dir)
            # Save checkpoints every 2 steps, and keep 5 checkpoints at most
            checkpointer = CheckpointService(chkp_dir, 2, 5, False)
            self.assertTrue(checkpointer.is_enabled())

            batch_size = 2
            # Launch the training
            worker_flags = [
                ("--worker_id", 1),
                ("--job_type", JobType.TRAINING_ONLY),
                ("--minibatch_size", batch_size),
                ("--model_zoo", _model_zoo_path),
                ("--model_def", "test_module.custom_model"),
            ]
            arguments = [item for flag in worker_flags for item in flag]
            worker = Worker(parse_worker_args(arguments))

            filename = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
            task_d = _TaskDispatcher(
                {filename: (0, 128)},
                {},
                {},
                records_per_task=64,
                num_epochs=1,
            )
            master = MasterServicer(
                2,
                batch_size,
                worker._opt_fn(),
                task_d,
                init_var=worker._model.trainable_variables,
                checkpoint_filename_for_init="",
                checkpoint_service=checkpointer,
                evaluation_service=None,
            )

            worker._stub = InProcessMaster(master)
            worker.run()

            # We should have 5 checkpoints when the training finishes
            expected_files = ["model_v%d.chkpt" % v for v in kept_versions]
            checkpoint_files = sorted(os.listdir(checkpointer._directory))
            self.assertEqual(checkpoint_files, expected_files)
            # Latest version should be 32
            self.assertEqual(32, checkpointer.get_latest_checkpoint_version())
            # Check all checkpoints
            for version in kept_versions:
                model = checkpointer.get_checkpoint_model(version)
                self.assertEqual(version, model.version)
            # Checkpoint not found
            with self.assertRaisesRegex(
                RuntimeError, "Failed to read model checkpoint from file"
            ):
                checkpointer.get_checkpoint_model(100)
示例#18
0
    def distributed_train_and_evaluate(
        self,
        feature_shape,
        model_def,
        model_params="",
        training=True,
        dataset="",
    ):
        """
        Run distributed training and evaluation with a local master.
        grpc calls are mocked by local master call.

        Args:
            feature_shape: Passed to the helper that creates the RecordIO
                file for the selected dataset.
            model_def: Model definition handed to the `Worker`.
            model_params: Model parameter string handed to the `Worker`.
            training: True for job type `TRAINING_ONLY`, False for job type
                `EVALUATION_ONLY`.
            dataset: "imagenet" or "frappe" select the matching RecordIO file
                helper; any other value uses `create_recordio_file`.
        """
        job_type = (JobType.TRAINING_ONLY
                    if training else JobType.EVALUATION_ONLY)
        # Decide the batch size before building the worker, so the worker and
        # the master servicer are configured with the same value.
        batch_size = 8 if dataset == "imagenet" else 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def=model_def,
            model_params=model_params,
            channel=None,
        )

        if dataset == "imagenet":
            shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
        elif dataset == "frappe":
            shards = {
                create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
            }
        else:
            shards = {create_recordio_file(128, feature_shape): (0, 128)}

        # Evaluation always uses the generated shard; training uses it only
        # for a training job.
        training_shards = shards if training else {}
        evaluation_shards = shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )
        # Initialize checkpoint service
        checkpoint_service = CheckpointService("", 0, 0, True)
        # The two job types differ only in the last two positional arguments:
        # (1, False) for training, (0, True) for evaluation only.
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1 if training else 0,
            not training,
        )
        task_d.set_evaluation_service(evaluation_service)
        # The master service
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        # Route the worker's master calls to the in-process servicer.
        worker._stub = InProcessMaster(master)

        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        # No more task.
        self.assertTrue(not task.shard_name)
示例#19
0
    def distributed_train_and_evaluate(self, training=True):
        """
        Run distributed training or evaluation with a local master whose
        version is bumped before each report, so that reports get rejected
        and the worker has to retry. grpc calls are mocked by local master
        call.
        """

        def bump_version_in_window(servicer):
            # While the version is strictly between 2 and 80, advance it by
            # one so the incoming report no longer matches the master.
            if 2 < servicer._version < 80:
                servicer._version += 1

        class _Master(InProcessMaster):
            def ReportGradient(self, req):
                # For testing of retrain when gradient not accepted.
                bump_version_in_window(self._m)
                return self._m.ReportGradient(req, None)

            def ReportEvaluationMetrics(self, req):
                # Testing of evaluation retries.
                bump_version_in_window(self._m)
                return self._m.ReportEvaluationMetrics(req, None)

        if training:
            job_type = JobType.TRAINING_ONLY
        else:
            job_type = JobType.EVALUATION_ONLY
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def="test_module.custom_model",
            channel=None,
        )

        shards = {create_recordio_file(128): 128}
        # Only one of the two shard sets is populated, depending on the job.
        training_shards = shards if training else {}
        evaluation_shards = {} if training else shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )

        evaluation_service = None
        if not training:
            evaluation_service = EvaluationService(
                None, None, task_d, 0, 0, 0, True
            )
            task_d.set_evaluation_service(evaluation_service)

        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=evaluation_service,
        )
        worker._stub = _Master(master)

        for variable in worker._model.trainable_variables:
            master.set_model_var(variable.name, variable.numpy())

        worker.run()

        request = elasticdl_pb2.GetTaskRequest()
        request.worker_id = 1
        task = master.GetTask(request, None)
        # No more task.
        self.assertTrue(not task.shard_name)
示例#20
0
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=None,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Run distributed training and evaluation against a local master.

    gRPC calls are mocked by calling an in-process master directly.

    Args:
        feature_shape: The shape of model input; forwarded to
            `create_recordio_file` when generating the test data.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The model parameters as a string that will be used to
            instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `JobType.TRAINING_WITH_EVALUATION`,
            False for job type `JobType.EVALUATION_ONLY`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: An optional list of callback classes. Each one is
            instantiated as `callback_class(master, worker)` and handed to
            the in-process master. Defaults to no callbacks.
        use_async: A bool. Kept in the signature for callers; this function
            does not read it.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers,
            e.g. `DistributionStrategy.PARAMETER_SERVER`.

    Returns:
        The master's model version (`master._version`) after the worker has
        finished running.

    Raises:
        RuntimeError: If the master still hands out a task after the worker
            exits.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    # Normalise optional collections here instead of using mutable defaults.
    pservers = pservers or []
    ps_channels = ps_channels or []
    callback_classes = callback_classes or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    # Block until every parameter-server channel is ready.
    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    # IMAGENET and FRAPPE use a single batch of records; others use 128.
    if dataset_name in (DatasetName.IMAGENET, DatasetName.FRAPPE):
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    # The same shard is always used for evaluation; it is used for training
    # as well only when this is a training job.
    training_shards = shards if training else {}
    evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    # The sixth positional argument is False for training jobs and True for
    # evaluation-only jobs (presumably an "evaluation only" switch -- confirm
    # against EvaluationService).
    evaluation_service = EvaluationService(
        None,
        task_d,
        0,
        0,
        evaluation_steps,
        not training,
        model_module[eval_metrics_fn],
    )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]

    # Route the worker (and any parameter servers) to the local master.
    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    # After the worker exits, the master must have no task left to give out.
    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits.")
    return master._version
示例#21
0
    def distributed_train_and_evaluate(
        self,
        training=True,
        callback_classes=None,
        use_async=False,
        grads_to_wait=2,
        get_model_steps=1,
    ):
        """Run distributed training or evaluation with a local master.

        gRPC calls are mocked by calling an in-process master directly.
        After the worker finishes, asserts that the master has no task left.

        Args:
            training: True runs a `JobType.TRAINING_ONLY` job; False runs a
                `JobType.EVALUATION_ONLY` job.
            callback_classes: An optional list of callback classes. Each one
                is instantiated as `callback_class(master, worker, self)` and
                handed to `InProcessMaster`. Defaults to no callbacks.
            use_async: Forwarded to `MasterServicer` as `use_async`.
            grads_to_wait: Passed as the first positional argument of
                `MasterServicer`; must be 1 when `use_async` is set.
            get_model_steps: Forwarded to `Worker` as `get_model_steps`.

        Raises:
            ValueError: If `use_async` is set and `grads_to_wait` is greater
                than 1.
        """
        if use_async and grads_to_wait > 1:
            raise ValueError(
                "grads_to_wait should be 1 when using asynchronous SGD."
            )
        # Avoid a shared mutable default for the callback list.
        if callback_classes is None:
            callback_classes = []

        job_type = (
            JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
        )
        batch_size = 16
        worker = Worker(
            1,
            job_type,
            batch_size,
            _model_zoo_path,
            model_def="test_module.custom_model",
            channel=None,
            get_model_steps=get_model_steps,
        )

        # One generated shard of 128 records, used either for training or
        # for evaluation depending on the job type.
        shards = {create_recordio_file(128): (0, 128)}
        training_shards = shards if training else {}
        evaluation_shards = {} if training else shards
        task_d = _TaskDispatcher(
            training_shards,
            evaluation_shards,
            {},
            records_per_task=64,
            num_epochs=1,
        )

        # Only evaluation jobs get an evaluation service.
        if training:
            evaluation_service = None
        else:
            evaluation_service = EvaluationService(
                None, None, task_d, 0, 0, 0, True
            )
            task_d.set_evaluation_service(evaluation_service)

        master = MasterServicer(
            grads_to_wait,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=None,
            evaluation_service=evaluation_service,
            use_async=use_async,
        )
        callbacks = [
            callback_class(master, worker, self)
            for callback_class in callback_classes
        ]
        worker._stub = InProcessMaster(master, callbacks)

        # Seed the master with the worker model's trainable variables.
        for var in worker._model.trainable_variables:
            master.set_model_var(var.name, var.numpy())

        worker.run()

        # After the worker exits, the master must have no task left.
        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = 1
        task = master.GetTask(req, None)
        self.assertFalse(task.shard_name)