def testEvaluationOnly(self):
    """An evaluation-only job finishes once every eval task completes."""
    dispatcher = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)
    eval_service = EvaluationService(None, None, dispatcher, 0, 0, 0, True)
    dispatcher.set_evaluation_service(eval_service)
    servicer = MasterServicer(
        2,
        2,
        None,
        dispatcher,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=eval_service,
    )
    servicer.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
    # Two eval shards of 10 records split into 8 tasks total (presumably
    # 3 records per task -> 4 tasks per shard).
    self.assertEqual(8, len(dispatcher._todo))
    for _ in range(8):
        # The job must not report finished until every task is done.
        self.assertFalse(eval_service._eval_job.finished())
        eval_service.complete_task()
    self.assertTrue(eval_service._eval_job.finished())
def testUserDefinedModel(self):
    """GetModel returns every trainable variable of a user-defined model."""
    servicer = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    request = elasticdl_pb2.GetModelRequest()
    request.method = elasticdl_pb2.MINIMUM
    request.version = 0
    model_inst = SimpleModel()
    model_inst.build(SimpleModel.input_shapes())
    # Register all trainable variables with the master.
    for variable in model_inst.trainable_variables:
        servicer.set_model_var(variable.name, variable.numpy())
    # Fetch version 0 and check the full parameter set came back.
    model = servicer.GetModel(request, None)
    self.assertEqual(0, model.version)
    self.assertEqual(
        [
            "dense_1/bias:0",
            "dense_1/kernel:0",
            "dense_2/bias:0",
            "dense_2/kernel:0",
        ],
        sorted(model.param.keys()),
    )
def testEvaluationService(self):
    """Evaluation job lifecycle with a checkpoint-backed service."""
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        dispatcher = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )
        eval_service = EvaluationService(
            checkpoint_service, None, dispatcher, 10, 20, 0, False
        )
        # Metrics must be rejected while no evaluation is ongoing.
        metrics = {
            "mse": ndarray_to_tensor(np.array([100, 200], dtype=np.float32))
        }
        self.assertFalse(eval_service.report_evaluation_metrics(1, metrics))
        # No checkpoint exists yet, so no new eval job can be created.
        self.assertFalse(eval_service.try_to_create_new_job())
        servicer = MasterServicer(
            2,
            2,
            None,
            dispatcher,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=eval_service,
        )
        servicer.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
        # Adding an evaluation task doubles the todo queue (8 -> 16)
        # and starts an eval job.
        self.assertEqual(8, len(dispatcher._todo))
        eval_service.add_evaluation_task(0)
        self.assertEqual(16, len(dispatcher._todo))
        self.assertFalse(eval_service._eval_job.finished())
        for _ in range(8):
            self.assertFalse(eval_service._eval_job.finished())
            eval_service.complete_task()
        # The job is torn down after its final task completes.
        self.assertIsNone(eval_service._eval_job)
        self.assertFalse(eval_service.try_to_create_new_job())
def testGetModel(self):
    """GetModel under MINIMUM/FIXED methods, including checkpoint fallback."""
    servicer = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    servicer.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
    # The master model starts at version 0.
    self.assertEqual(0, servicer._version)
    # MINIMUM with version 0 returns the current (version 0) model.
    req = elasticdl_pb2.GetModelRequest()
    req.version = 0
    req.method = elasticdl_pb2.MINIMUM
    model = servicer.GetModel(req, None)
    self.assertEqual(0, model.version)
    self.assertEqual(["x"], list(model.param.keys()))
    np.testing.assert_array_equal(
        np.array([1.0, 1.0]), tensor_to_ndarray(model.param["x"])
    )
    # Bump the master to version 1; MINIMUM with version 0 now yields
    # the newer version 1 model.
    servicer._version = 1
    servicer.set_model_var("x", np.array([2.0, 2.0], dtype=np.float32))
    servicer.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
    model = servicer.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], sorted(model.param.keys()))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )
    # Requesting a future version must raise.
    req.version = 2
    self.assertRaises(ValueError, servicer.GetModel, req, None)
    # FIXED at version 1 returns exactly version 1.
    req.method = elasticdl_pb2.FIXED
    req.version = 1
    model = servicer.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], sorted(model.param.keys()))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )
    # A past version is unavailable when no checkpoint exists.
    req.version = 0
    model = servicer.GetModel(req, None)
    self.assertFalse(model.param)
    # A past version becomes available once it is checkpointed.
    with tempfile.TemporaryDirectory() as tempdir:
        chk_dir = os.path.join(tempdir, "testGetModel")
        os.makedirs(chk_dir)
        req.version = servicer._version
        req.method = elasticdl_pb2.MINIMUM
        model = servicer.GetModel(req, None)
        servicer._checkpoint_service = CheckpointService(chk_dir, 2, 5, False)
        servicer._checkpoint_service.save(servicer._version, model, False)
        servicer._version = 2
        servicer.set_model_var("z", np.array([2.0, 2.0], dtype=np.float32))
        req.version = 1
        req.method = elasticdl_pb2.FIXED
        model = servicer.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], sorted(model.param.keys()))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )
def testReportGradient(self):
    """Version checks, validation, and accumulation in ReportGradient."""

    def make_grad():
        """Make a ReportGradientRequest compatible with the model."""
        request = elasticdl_pb2.ReportGradientRequest()
        request.gradient["x"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        request.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
        )
        request.model_version = 1
        return request

    servicer = MasterServicer(
        3,
        3,
        tf.optimizers.SGD(0.1),
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    servicer._version = 1
    servicer.set_model_var("x", np.array([2.0], dtype=np.float32))
    servicer.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
    # A future model version must raise.
    req = make_grad()
    req.model_version = 2
    self.assertRaises(ValueError, servicer.ReportGradient, req, None)
    # An outdated model version is rejected (not raised).
    req = make_grad()
    req.model_version = 0
    res = servicer.ReportGradient(req, None)
    self.assertFalse(res.accepted)
    self.assertEqual(1, res.model_version)
    # A gradient for an unknown variable must raise.
    req = make_grad()
    req.gradient["z"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, servicer.ReportGradient, req, None)
    # A gradient with an incompatible shape must raise.
    req = make_grad()
    req.gradient["y"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, servicer.ReportGradient, req, None)
    # A current-version report is accepted.
    req = make_grad()
    res = servicer.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)
    # A partial gradient (only "x") at the current version is accepted too.
    req = make_grad()
    del req.gradient["y"]
    res = servicer.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)
    # Gradients accumulate across accepted reports: x twice, y once.
    np.testing.assert_array_equal(
        np.array([0.2], dtype=np.float32), servicer._gradient_sum["x"]
    )
    np.testing.assert_array_equal(
        np.array([0.03, 0.06], dtype=np.float32),
        servicer._gradient_sum["y"],
    )
    self.assertEqual(2, servicer._grad_n)
    # The third report hits grads_to_wait (3): accepted, the averaged
    # gradients are applied, and a new model version is created.
    req = make_grad()
    res = servicer.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(2, res.model_version)
    self.assertFalse(servicer._gradient_sum)
    self.assertEqual(0, servicer._grad_n)
    np.testing.assert_array_equal(
        # [2] - 0.1 * [0.1]
        np.array([1.99], dtype=np.float32),
        servicer._model["x"].numpy(),
    )
    np.testing.assert_array_equal(
        # [12, 13] - 0.1 * [0.02, 0.04]
        np.array([11.998, 12.996], dtype=np.float32),
        servicer._model["y"].numpy(),
    )
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=None,
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.

    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation-metrics function exported
            by the model module.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure. Defaults to no callbacks.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.

    Raises:
        RuntimeError: if any task is still pending after the worker exits.
    """
    # Fix: `callback_classes=[]` was a shared mutable default argument;
    # normalize the None sentinel instead.
    if callback_classes is None:
        callback_classes = []
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    # Small datasets get exactly one batch of records; the default gets 128.
    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    # Asynchronous SGD applies every gradient immediately.
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task should remain once the worker has exited.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
class ParameterSeverModelHandlerTest(unittest.TestCase):
    """Tests for ModelHandler under the ParameterServerStrategy."""

    # NOTE(review): "Sever" looks like a typo for "Server"; kept unchanged
    # because renaming the class could break name-based test selection.

    def setUp(self):
        tf.keras.backend.clear_session()
        self.master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        self.master._version = 1
        self.model_handler = ModelHandler.get_model_handler(
            distribution_strategy="ParameterServerStrategy", stub=self.master
        )

    def test_get_model_to_train(self):
        """The training model uses the ElasticDL Embedding layer."""
        model = custom_model_with_embedding()
        model = self.model_handler.get_model_to_train(model)
        self.assertEqual(type(model.layers[1]), Embedding)

    def test_get_model_to_export(self):
        """The exported functional model carries the trained parameters."""
        model = custom_model_with_embedding()
        trained_params = _mock_model_trained_params(model)
        for name, value in trained_params.items():
            self.master.set_model_var(name, value)
        train_model = self.model_handler.get_model_to_train(model)
        export_model = self.model_handler.get_model_to_export(
            train_model, dataset=None
        )
        result = export_model.call(tf.constant([0])).numpy()
        self.assertEqual(result[0][0], 3.0)

    def test_get_subclass_model_to_export(self):
        """A subclassed model is traced with a dataset and keeps params."""

        def _get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(
                np.random.randint(0, 10, (10, 4))
            )
            return dataset.batch(2)

        model = CustomModel()
        dataset = _get_dataset()
        trained_params = {
            "custom_model/embedding/embeddings:0": np.ones(
                (4, 2), dtype="float32"
            ),
            "custom_model/dense/kernel:0": np.ones((2, 1), dtype="float32"),
            "custom_model/dense/bias:0": np.ones((1), dtype="float32"),
        }
        for name, value in trained_params.items():
            self.master.set_model_var(name, value)
        train_model = self.model_handler.get_model_to_train(model)
        self.assertEqual(type(train_model.embedding), Embedding)
        export_model = self.model_handler.get_model_to_export(
            train_model, dataset=dataset
        )
        result = export_model.call(tf.constant([0])).numpy()
        self.assertEqual(result[0][0], 3.0)
def distributed_train_and_evaluate(
    self,
    feature_shape,
    model_def,
    model_params="",
    training=True,
    dataset="",
):
    """Run distributed training and evaluation with a local master.

    grpc calls are mocked by local master call.
    """
    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def=model_def,
        model_params=model_params,
        channel=None,
    )

    # Pick shard fixtures per dataset; values are (start, record_count).
    if dataset == "imagenet":
        batch_size = 8
        shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
    elif dataset == "frappe":
        shards = {
            create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
        }
    else:
        shards = {create_recordio_file(128, feature_shape): (0, 128)}

    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    # Initialize checkpoint service
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 1, False
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 0, True
        )
    task_d.set_evaluation_service(evaluation_service)

    # The master service
    master = MasterServicer(
        2,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
    )
    worker._stub = InProcessMaster(master)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task should remain after the worker exits.
    self.assertFalse(task.shard_name)
def distributed_train_and_evaluate(self, training=True):
    """Run distributed training and evaluation with a local master.

    grpc calls are mocked by local master call.
    """

    class _Master(InProcessMaster):
        """In-process master that forces retries in a version window."""

        def ReportGradient(self, req):
            if 2 < self._m._version < 80:
                # For testing of retrain when gradient not accepted.
                # Increase master version to reject the gradient.
                self._m._version += 1
            return self._m.ReportGradient(req, None)

        def ReportEvaluationMetrics(self, req):
            if 2 < self._m._version < 80:
                # Testing of evaluation retries. Increase the master
                # version so the evaluation metrics will not be accepted.
                self._m._version += 1
            return self._m.ReportEvaluationMetrics(req, None)

    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def="test_module.custom_model",
        channel=None,
    )

    # NOTE(review): sibling helpers map a shard file to a (start, count)
    # tuple, but here the value is the bare count 128 — confirm which
    # _TaskDispatcher API this test targets before changing it.
    shards = {create_recordio_file(128): 128}
    if training:
        training_shards = shards
        evaluation_shards = {}
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if not training:
        evaluation_service = EvaluationService(
            None, None, task_d, 0, 0, 0, True
        )
        task_d.set_evaluation_service(evaluation_service)
    else:
        evaluation_service = None

    master = MasterServicer(
        2,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
    )
    worker._stub = _Master(master)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task should remain after the worker exits.
    self.assertFalse(task.shard_name)
def distributed_train_and_evaluate(
    self,
    training=True,
    callback_classes=None,
    use_async=False,
    grads_to_wait=2,
    get_model_steps=1,
):
    """Run distributed training and evaluation with a local master.

    grpc calls are mocked by local master call.

    Args:
        training: True for a TRAINING_ONLY job, False for EVALUATION_ONLY.
        callback_classes: classes instantiated as
            ``callback_class(master, worker, self)`` and attached to the
            in-process master stub. Defaults to no callbacks.
        use_async: True to use asynchronous updates (requires
            ``grads_to_wait == 1``).
        grads_to_wait: number of gradients the master waits for before
            applying an update.
        get_model_steps: worker performs `get_model` every this many steps.

    Raises:
        ValueError: if ``use_async`` is True and ``grads_to_wait`` > 1.
    """
    # Fix: `callback_classes=[]` was a shared mutable default argument;
    # normalize the None sentinel instead.
    if callback_classes is None:
        callback_classes = []
    if use_async and grads_to_wait > 1:
        raise ValueError(
            "grads_to_wait should be 1 when using asynchronous SGD."
        )
    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def="test_module.custom_model",
        channel=None,
        get_model_steps=get_model_steps,
    )

    shards = {create_recordio_file(128): (0, 128)}
    if training:
        training_shards = shards
        evaluation_shards = {}
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if not training:
        evaluation_service = EvaluationService(
            None, None, task_d, 0, 0, 0, True
        )
        task_d.set_evaluation_service(evaluation_service)
    else:
        evaluation_service = None

    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker, self)
        for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task should remain after the worker exits.
    self.assertTrue(not task.shard_name)