def testUserDefinedModel(self):
    master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    req = elasticdl_pb2.GetModelRequest()
    req.method = elasticdl_pb2.MINIMUM
    req.version = 0
    model_inst = SimpleModel()
    model_inst.build(SimpleModel.input_shapes())
    for variable in model_inst.trainable_variables:
        master.set_model_var(variable.name, variable.numpy())

    # Get version 0
    model = master.GetModel(req, None)
    self.assertEqual(0, model.version)
    self.assertEqual(
        [
            "dense_1/bias:0",
            "dense_1/kernel:0",
            "dense_2/bias:0",
            "dense_2/kernel:0",
        ],
        list(sorted(model.param.keys())),
    )
def testEvaluationOnly(self):
    task_d = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)
    evaluation_service = EvaluationService(
        None, None, task_d, 0, 0, 0, True
    )
    task_d.set_evaluation_service(evaluation_service)
    master = MasterServicer(
        2,
        2,
        None,
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=None,
        evaluation_service=evaluation_service,
    )
    master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
    self.assertEqual(8, len(task_d._todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())
def testSaveLoadCheckpoint(self):
    init_var = m["custom_model"]().trainable_variables
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testSaveLoadCheckpoint")
        os.makedirs(chkp_dir)
        checkpointer = CheckpointService(chkp_dir, 3, 5, False)
        self.assertTrue(checkpointer.is_enabled())

        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init="",
            checkpoint_service=checkpointer,
            evaluation_service=None,
        )

        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0
        model = master.GetModel(req, None)
        checkpointer.save(0, model, False)
        loaded_model = checkpointer.get_checkpoint_model(0)
        self.assertEqual(model.version, loaded_model.version)
        for var, loaded_var in zip(model.param, loaded_model.param):
            self.assertEqual(var, loaded_var)
def testEvaluationService(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        task_d = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )

        # Evaluation metrics will not be accepted if no evaluation ongoing
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 10, 20, 0, False
        )
        evaluation_metrics = {
            "mse": ndarray_to_tensor(
                np.array([100, 200], dtype=np.float32)
            )
        }
        self.assertFalse(
            evaluation_service.report_evaluation_metrics(
                1, evaluation_metrics
            )
        )

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        # Add an evaluation task and we can start evaluation
        self.assertEqual(8, len(task_d._todo))
        evaluation_service.add_evaluation_task(0)
        self.assertEqual(16, len(task_d._todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        for i in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()

        self.assertTrue(evaluation_service._eval_job is None)
        self.assertFalse(evaluation_service.try_to_create_new_job())
def testReportTaskResult(self):
    task_d = _TaskDispatcher(
        {"shard_1": (0, 10), "shard_2": (0, 9)},
        {},
        {},
        records_per_task=3,
        num_epochs=2,
    )
    master = MasterServicer(
        3,
        3,
        None,
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    # task to number of runs.
    tasks = defaultdict(int)
    while True:
        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = random.randint(1, 10)
        task = master.GetTask(req, None)
        if not task.shard_name:
            break
        self.assertEqual(task_d._doing[task.task_id][0], req.worker_id)
        task_key = (task.shard_name, task.start, task.end)
        tasks[task_key] += 1
        report = elasticdl_pb2.ReportTaskResultRequest()
        report.task_id = task.task_id
        if task.start == 0 and tasks[task_key] == 1:
            # Simulate error reports.
            report.err_message = "Worker error"
        master.ReportTaskResult(report, None)

    self.assertDictEqual(
        {
            ("shard_1", 0, 3): 3,
            ("shard_1", 3, 6): 2,
            ("shard_1", 6, 9): 2,
            ("shard_1", 9, 10): 2,
            ("shard_2", 0, 3): 3,
            ("shard_2", 3, 6): 2,
            ("shard_2", 6, 9): 2,
        },
        tasks,
    )
def setUp(self):
    tf.keras.backend.clear_session()
    self.master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    self.master._version = 1
    self.model_handler = ModelHandler.get_model_handler(
        distribution_strategy="ParameterServerStrategy", stub=self.master
    )
def testEvaluationService(self):
    task_d = _TaskDispatcher(
        {"f1": (0, 10), "f2": (0, 10)},
        {"f1": (0, 10), "f2": (0, 10)},
        {},
        3,
        1,
    )

    # Evaluation metrics will not be accepted if no evaluation ongoing
    evaluation_service = EvaluationService(
        None, task_d, 10, 20, 0, False, _eval_metrics_fn
    )
    _ = MasterServicer(
        2, task_d, evaluation_service=evaluation_service, master=None
    )

    # No checkpoint available
    self.assertFalse(evaluation_service.try_to_create_new_job())

    # Add an evaluation task and we can start evaluation
    self.assertEqual(8, len(task_d._todo))
    evaluation_service.add_evaluation_task(False)
    self.assertEqual(8, len(task_d._eval_todo))
    self.assertFalse(evaluation_service._eval_job.finished())

    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()

    self.assertTrue(evaluation_service._eval_job is None)
    self.assertFalse(evaluation_service.try_to_create_new_job())
def _create_master_and_worker(
    self, service_endpoint=None, embedding_dims={}
):
    model_inst = custom_model()
    master = MasterServicer(
        2,
        2,
        tf.optimizers.SGD(0.1),
        None,
        init_var=model_inst.trainable_variables,
        embedding_service_endpoint=service_endpoint,
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=None,
        checkpoint_service=None,
        evaluation_service=None,
    )
    arguments = [
        "--worker_id",
        1,
        "--job_type",
        JobType.TRAINING_ONLY,
        "--minibatch_size",
        2,
        "--model_zoo",
        _model_zoo_path,
        "--model_def",
        "test_module.custom_model",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)
    worker.set_model(model_inst)
    worker._stub = InProcessMaster(master)
    return master, worker
def testEvaluationOnly(self):
    task_d = create_task_manager([], [("f1", 0, 10), ("f2", 0, 10)])
    task_d.create_tasks(elasticai_api_pb2.EVALUATION)
    evaluation_service = EvaluationService(
        task_d.create_evaluation_tasks, 0, True, _eval_metrics_fn
    )
    task_d.set_evaluation_service(evaluation_service)
    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )
    _ = MasterServicer(
        master.task_d,
        master.instance_manager,
        None,
        evaluation_service,
    )
    self.assertEqual(8, len(task_d._eval_todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())
def testEvaluationService(self):
    task_d = create_task_manager(
        [("f1", 0, 10), ("f2", 0, 10)], [("f1", 0, 10), ("f2", 0, 10)]
    )

    # Evaluation metrics will not be accepted if no evaluation ongoing
    evaluation_service = EvaluationService(
        task_d.create_evaluation_tasks, 0, False, _eval_metrics_fn
    )
    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )
    _ = MasterServicer(
        master.task_d, master.instance_manager, None, evaluation_service
    )

    # No checkpoint available
    self.assertFalse(evaluation_service.try_to_create_new_job())

    # Add an evaluation task and we can start evaluation
    self.assertEqual(8, len(task_d._todo))
    evaluation_service.add_evaluation_task(False)
    self.assertEqual(8, len(task_d._eval_todo))
    self.assertFalse(evaluation_service._eval_job.finished())

    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()

    self.assertTrue(evaluation_service._eval_job is None)
    self.assertFalse(evaluation_service.try_to_create_new_job())
def _create_master_service(self, args):
    self.logger.info("Creating master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        self.optimizer,
        self.task_d,
        init_var=self.model_inst.trainable_variables
        if self.model_inst.built
        else [],
        embedding_dims=self.embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=self.checkpoint_service,
        evaluation_service=self.evaluation_service,
        embedding_service_endpoint=self.embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(
        master_servicer, server
    )
    server.add_insecure_port("[::]:{}".format(args.port))
    self.logger.info("The port of the master server is: %d", args.port)
    return master_servicer, server
def _create_master_and_worker(
    self, service_endpoint=None, embedding_dims={}
):
    model_inst = custom_model()
    master = MasterServicer(
        2,
        2,
        tf.optimizers.SGD(0.1),
        None,
        init_var=model_inst.trainable_variables,
        embedding_service_endpoint=service_endpoint,
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=None,
        checkpoint_service=None,
        evaluation_service=None,
    )
    worker = Worker(
        1,
        JobType.TRAINING_ONLY,
        2,
        _model_zoo_path,
        model_def="test_module.custom_model",
        channel=None,
    )
    worker.set_model(model_inst)
    worker._stub = InProcessMaster(master)
    return master, worker
def master_creator():
    return MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
        master=None,
    )
def _create_master_service(self, args):
    self.logger.info("Creating master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.minibatch_size,
        self.task_d,
        evaluation_service=self.evaluation_service,
        master=self,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(
        master_servicer, server
    )
    server.add_insecure_port("[::]:{}".format(args.port))
    self.logger.info("The port of the master server is: %d", args.port)
    return master_servicer, server
def testInitFromCheckpoint(self):
    init_var = m["custom_model"]().trainable_variables
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testInitFromCheckpoint")
        os.makedirs(chkp_dir)
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService(chkp_dir, 2, 3, False),
            evaluation_service=None,
        )
        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0
        model = master.GetModel(req, None)
        master._checkpoint_service.save(master._version, model, False)
        chkp_file = master._checkpoint_service.get_checkpoint_path(
            master._version
        )

        # Create variables from init_var, get init value from checkpoint.
        master2 = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init=chkp_file,
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        model2 = master2.GetModel(req, None)
        self.assertEqual(model, model2)

        # Create variables from checkpoint.
        master3 = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init=chkp_file,
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        model3 = master3.GetModel(req, None)
        self.assertEqual(model, model3)
def testGetEmptyTask(self):
    master = MasterServicer(
        3,
        _TaskDispatcher({}, {}, {}, records_per_task=3, num_epochs=2),
        evaluation_service=None,
    )
    req = elasticdl_pb2.GetTaskRequest()

    # No task yet, make sure the returned versions are as expected.
    req.worker_id = 1
    task = master.get_task(req, None)
    self.assertEqual("", task.shard_name)
    self.assertEqual(0, task.model_version)

    master._version = 1
    task = master.get_task(req, None)
    self.assertEqual("", task.shard_name)
    self.assertEqual(1, task.model_version)
def test_report_task_result(self):
    self.master.task_manager = create_task_manager(
        [("shard_1", 0, 10), ("shard_2", 0, 9)], [], 2
    )
    master = MasterServicer(
        self.master.task_manager,
        self.master.instance_manager,
        None,
        None,
    )
    # task to number of runs.
    tasks = defaultdict(int)
    while True:
        req = elasticai_api_pb2.GetTaskRequest()
        req.worker_id = random.randint(1, 10)
        task = master.get_task(req, None)
        if not task.shard.name:
            break
        self.assertEqual(
            self.master.task_manager._doing[task.task_id][0], req.worker_id
        )
        task_key = (task.shard.name, task.shard.start, task.shard.end)
        tasks[task_key] += 1
        report = elasticai_api_pb2.ReportTaskResultRequest()
        report.task_id = task.task_id
        if task.shard.start == 0 and tasks[task_key] == 1:
            # Simulate error reports.
            report.err_message = "Worker error"
        master.report_task_result(report, None)

    self.assertDictEqual(
        {
            ("shard_1", 0, 3): 3,
            ("shard_1", 3, 6): 2,
            ("shard_1", 6, 9): 2,
            ("shard_1", 9, 10): 2,
            ("shard_2", 0, 3): 3,
            ("shard_2", 3, 6): 2,
            ("shard_2", 6, 9): 2,
        },
        tasks,
    )
def test_get_comm_rank(self):
    self.master.rendezvous_server = HorovodRendezvousServer(
        server_host="localhost"
    )
    self.master.rendezvous_server.start()
    self.master.rendezvous_server.set_worker_hosts(
        ["172.0.0.1", "172.0.0.2"]
    )
    k8s_client = Mock()
    k8s_client.get_worker_service_address = MagicMock(
        return_value="172.0.0.1:8080"
    )
    self.master.instance_manager = Mock(_k8s_client=k8s_client)
    master_servicer = MasterServicer(
        3, evaluation_service=None, master=self.master
    )
    request = elasticdl_pb2.GetCommRankRequest()
    request.worker_id = 0
    rank_response = master_servicer.get_comm_rank(request, None)
    self.assertEqual(rank_response.world_size, 2)
    self.assertEqual(rank_response.rank_id, 0)
    self.assertEqual(rank_response.rendezvous_id, 1)
def test_get_empty_task(self):
    self.master.task_manager = create_task_manager([], [])
    master_servicer = MasterServicer(
        self.master.task_manager,
        self.master.instance_manager,
        None,
        None,
    )
    req = elasticai_api_pb2.GetTaskRequest()

    # No task yet, make sure the returned versions are as expected.
    req.worker_id = 1
    task = master_servicer.get_task(req, None)
    self.assertEqual("", task.shard.name)
    self.assertEqual(0, task.model_version)

    master_servicer._version = 1
    task = master_servicer.get_task(req, None)
    self.assertEqual("", task.shard.name)
    self.assertEqual(1, task.model_version)
def test_get_comm_rank(self):
    self.master.rendezvous_server = HorovodRendezvousServer(
        server_host="localhost"
    )
    self.master.rendezvous_server.start()
    self.master.rendezvous_server.set_worker_hosts(
        [("worker-0", "172.0.0.1"), ("worker-1", "172.0.0.2")]
    )
    mock_instance_manager = Mock()
    mock_instance_manager.get_worker_pod_ip = MagicMock(
        return_value="172.0.0.1"
    )
    self.master.instance_manager = mock_instance_manager
    master_servicer = MasterServicer(
        3, evaluation_service=None, master=self.master
    )
    request = elasticdl_pb2.GetCommRankRequest()
    request.worker_id = 0
    rank_response = master_servicer.get_comm_rank(request, None)
    self.assertEqual(rank_response.world_size, 2)
    self.assertEqual(rank_response.rank_id, 0)
    self.assertEqual(rank_response.rendezvous_id, 1)
def test_get_comm_rank(self):
    self.master.rendezvous_server = HorovodRendezvousServer(
        server_host="localhost"
    )
    self.master.rendezvous_server.start()
    self.master.rendezvous_server.add_worker("172.0.0.1")
    self.master.rendezvous_server.add_worker("172.0.0.2")
    mock_instance_manager = Mock()
    mock_instance_manager.get_worker_pod_ip = MagicMock(
        return_value="172.0.0.1"
    )
    self.master.instance_manager = mock_instance_manager
    master_servicer = MasterServicer(
        self.master.task_manager,
        self.master.instance_manager,
        self.master.rendezvous_server,
        None,
    )
    request = elasticai_api_pb2.GetCommRankRequest()
    request.worker_host = "172.0.0.1"
    rank_response = master_servicer.get_comm_rank(request, None)
    self.assertEqual(rank_response.world_size, 2)
    self.assertEqual(rank_response.rank_id, 0)
    self.assertEqual(rank_response.rendezvous_id, 1)
def testGetEmptyTask(self):
    master = MasterServicer(
        2,
        3,
        None,
        _TaskDispatcher({}, {}, {}, records_per_task=3, num_epochs=2),
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    req = elasticdl_pb2.GetTaskRequest()

    # No task yet, make sure the returned versions are as expected.
    req.worker_id = 1
    task = master.GetTask(req, None)
    self.assertEqual("", task.shard_file_name)
    self.assertEqual(0, task.model_version)

    master._version = 1
    task = master.GetTask(req, None)
    self.assertEqual("", task.shard_file_name)
    self.assertEqual(1, task.model_version)
def testEvaluationOnly(self):
    task_d = _TaskDispatcher({}, {"f1": (0, 10), "f2": (0, 10)}, {}, 3, 1)
    evaluation_service = EvaluationService(
        None, task_d, 0, 0, 0, True, _eval_metrics_fn
    )
    task_d.set_evaluation_service(evaluation_service)
    _ = MasterServicer(
        2,
        task_d,
        evaluation_service=evaluation_service,
        master=None,
    )
    self.assertEqual(8, len(task_d._eval_todo))
    for i in range(8):
        self.assertFalse(evaluation_service._eval_job.finished())
        evaluation_service.complete_task()
    self.assertTrue(evaluation_service._eval_job.finished())
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that
            will be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callbacks that will be called at given
            stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers,
            e.g. DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed
        training and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    evaluation_steps = (
        1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker)
        for callback_class in callback_classes
    ]
    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
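# --- Hypothetical usage sketch (not part of the original suite): shows how
# the helper above is typically driven. The model zoo path (_model_zoo_path)
# and the "test_module.custom_model" definition are assumed to be available,
# as in the other tests in this file; the feature shape is an illustrative
# placeholder.
def _run_local_train_and_evaluate_sketch():
    # Run a single-worker training-with-evaluation job in which all gRPC
    # calls go through the in-process master built by the helper.
    model_version = distributed_train_and_evaluate(
        feature_shape=(10,),
        model_zoo_path=_model_zoo_path,
        model_def="test_module.custom_model",
        training=True,
        dataset_name=DatasetName.IMAGE_DEFAULT,
    )
    # The master bumps its version as updates are applied, so a completed
    # run ends with a non-negative model version.
    assert model_version >= 0
    return model_version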
def testGetModel(self):
    master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

    # Now master model is version 0
    self.assertEqual(0, master._version)

    # Get version 0 with minimum method
    req = elasticdl_pb2.GetModelRequest()
    req.version = 0
    req.method = elasticdl_pb2.MINIMUM
    model = master.GetModel(req, None)
    self.assertEqual(0, model.version)
    self.assertEqual(["x"], list(model.param.keys()))
    np.testing.assert_array_equal(
        np.array([1.0, 1.0]), tensor_to_ndarray(model.param["x"])
    )

    # Increase master model version to 1, but still request
    # version 0 with minimum method, we should get version 1
    master._version = 1
    master.set_model_var("x", np.array([2.0, 2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
    model = master.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )

    # Try to get version 2, it should raise exception.
    req.version = 2
    self.assertRaises(ValueError, master.GetModel, req, None)

    # Get fixed version 1
    req.method = elasticdl_pb2.FIXED
    req.version = 1
    model = master.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )

    # Previous model unavailable due to no checkpoint
    req.version = 0
    model = master.GetModel(req, None)
    self.assertFalse(model.param)

    # Previous model available through checkpoint
    with tempfile.TemporaryDirectory() as tempdir:
        chk_dir = os.path.join(tempdir, "testGetModel")
        os.makedirs(chk_dir)
        req.version = master._version
        req.method = elasticdl_pb2.MINIMUM
        model = master.GetModel(req, None)
        master._checkpoint_service = CheckpointService(
            chk_dir, 2, 5, False
        )
        master._checkpoint_service.save(master._version, model, False)
        master._version = 2
        master.set_model_var("z", np.array([2.0, 2.0], dtype=np.float32))
        req.version = 1
        req.method = elasticdl_pb2.FIXED
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )
def testReportGradient(self):
    def makeGrad():
        """Make a ReportGradientRequest compatible with model"""
        req = elasticdl_pb2.ReportGradientRequest()
        req.gradient["x"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
        )
        req.model_version = 1
        return req

    master = MasterServicer(
        3,
        3,
        tf.optimizers.SGD(0.1),
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master._version = 1
    master.set_model_var("x", np.array([2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

    # Report a future version, should raise exception
    req = makeGrad()
    req.model_version = 2
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an old version, should not be accepted
    req = makeGrad()
    req.model_version = 0
    res = master.ReportGradient(req, None)
    self.assertFalse(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report an unknown gradient, should raise.
    req = makeGrad()
    req.gradient["z"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an incompatible gradient, should raise.
    req = makeGrad()
    req.gradient["y"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report a current version, should be accepted.
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report a current version with part of gradients, should be accepted.
    req = makeGrad()
    del req.gradient["y"]
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Gradient should be accumulated.
    np.testing.assert_array_equal(
        np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
    )
    np.testing.assert_array_equal(
        np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
    )
    self.assertEqual(2, master._grad_n)

    # Report a current version, should be accepted, and a new version
    # created
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(2, res.model_version)
    self.assertFalse(master._gradient_sum)
    self.assertEqual(0, master._grad_n)
    np.testing.assert_array_equal(
        # [2] - 0.1 * [0.1]
        np.array([1.99], dtype=np.float32),
        master._model["x"].numpy(),
    )
    np.testing.assert_array_equal(
        # [12, 13] - 0.1 * [0.02, 0.04]
        np.array([11.998, 12.996], dtype=np.float32),
        master._model["y"].numpy(),
    )
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that
            will be used to instantiate the model, e.g. "param1=1,param2=2".
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed
        training and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker)
        for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
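# --- Hypothetical usage sketch (not from the original file): invokes the
# helper above in evaluation-only mode. _model_zoo_path and
# "test_module.custom_model" are assumed to exist, as in the surrounding
# tests; the feature shape is a placeholder.
def _run_local_evaluate_only_sketch():
    # training=False switches the job type to EVALUATION_ONLY, so only
    # evaluation shards are dispatched and no gradient updates are applied.
    model_version = distributed_train_and_evaluate(
        feature_shape=(10,),
        model_zoo_path=_model_zoo_path,
        model_def="test_module.custom_model",
        training=False,
        dataset_name=DatasetName.IMAGE_DEFAULT,
    )
    return model_version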
def main():
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            " and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--model_zoo",
            args.model_zoo,
            "--master_addr",
            master_addr,
            "--log_level",
            args.log_level,
            "--dataset_fn",
            args.dataset_fn,
            "--loss",
            args.loss,
            "--optimizer",
            args.optimizer,
            "--eval_metrics_fn",
            args.eval_metrics_fn,
            "--model_def",
            args.model_def,
            "--job_type",
            job_type,
            "--minibatch_size",
            str(args.minibatch_size),
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
            "--get_model_steps",
            str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(
                        WorkerManagerStatus.FINISHED
                    )
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")
def test_train_acceleration_with_embedding(self):
    kv_store = MockKvStore()
    model_inst = CustomModel()
    master = MasterServicer(
        2,
        2,
        tf.optimizers.SGD(0.1),
        None,
        init_var=model_inst.trainable_variables,
        checkpoint_filename_for_init=None,
        checkpoint_service=None,
        evaluation_service=None,
    )
    arguments = [
        "--worker_id",
        1,
        "--job_type",
        JobType.TRAINING_ONLY,
        "--minibatch_size",
        32,
        "--model_zoo",
        _model_zoo_path,
        "--model_def",
        "embedding_test_module.EdlEmbeddingModel",
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)
    worker._stub = InProcessMaster(master)

    inputs_list = [
        {
            "f1": tf.constant([[0], [1], [2]], tf.int64),
            "f2": tf.constant([[2], [1], [0]], tf.int64),
        },
        {
            "f1": tf.constant([[3], [4], [3]], tf.int64),
            "f2": tf.constant([[2], [1], [0]], tf.int64),
        },
    ]
    labels_list = [[0, 1, 0], [1, 1, 0]]
    input_dim = 5
    embedding_dim = 16
    worker.set_model(model_inst)

    # initialize kv store
    for layer in model_inst.layers:
        if isinstance(layer, Embedding):
            name = layer.name
            keys = [Embedding.get_key([name, i]) for i in range(input_dim)]
            values = [
                np.random.rand(embedding_dim).astype(np.float32)
                for i in range(input_dim)
            ]
            kv_store.update(keys, values)

    with mock.patch.object(
        EmbeddingService, "lookup_embedding", kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", kv_store.update
    ):
        worker._init_embedding_layer()
        worker._run_model_call_before_training(inputs_list[0])

        # run training process without tf.function
        correct_grads = []
        correct_ids_list = []
        for features, labels in zip(inputs_list, labels_list):
            loss, grads = worker.training_process_eagerly(features, labels)
            correct_grads.append(grads)
            ids = {}
            for layer in worker._embedding_layers:
                ids[layer.name] = layer.embedding_and_ids[0].batch_ids
            correct_ids_list.append(ids)
            worker._reset_embedding()

        # run training process with tf.function
        test_grads = []
        test_ids_list = []
        for features, labels in zip(inputs_list, labels_list):
            self.assertFalse(worker._train_eagerly)
            loss, grads = worker.training_process(features, labels)
            test_grads.append(grads)
            ids = {}
            for layer in worker._embedding_layers:
                ids[layer.name] = copy.deepcopy(
                    layer.embedding_and_ids[0].batch_ids
                )
            test_ids_list.append(ids)
            worker._reset_embedding()

    # compare the gradients
    for test_g, correct_g in zip(test_grads, correct_grads):
        for g1, g2 in zip(test_g, correct_g):
            if isinstance(g1, tf.IndexedSlices):
                self.assertTrue(np.isclose(g1.values, g2.values).all())
                self.assertTrue(np.isclose(g1.indices, g2.indices).all())
            else:
                self.assertTrue(np.isclose(g1, g2).all())

    for test_ids, correct_ids in zip(correct_ids_list, test_ids_list):
        for layer_name in correct_ids.keys():
            self.assertTrue(
                tf.equal(test_ids[layer_name], correct_ids[layer_name])
                .numpy()
                .all()
            )
class ParameterSeverModelHandlerTest(unittest.TestCase):
    def setUp(self):
        tf.keras.backend.clear_session()
        self.master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        self.master._version = 1
        self.model_handler = ModelHandler.get_model_handler(
            distribution_strategy="ParameterServerStrategy",
            stub=self.master,
        )

    def test_get_model_to_train(self):
        model_inst = custom_model_with_embedding()
        model_inst = self.model_handler.get_model_to_train(model_inst)
        self.assertEqual(type(model_inst.layers[1]), Embedding)

    def test_get_model_to_export(self):
        model_inst = custom_model_with_embedding()
        trained_params = _mock_model_trained_params(model_inst)
        for name, value in trained_params.items():
            self.master.set_model_var(name, value)
        train_model = self.model_handler.get_model_to_train(model_inst)
        export_model = self.model_handler.get_model_to_export(
            train_model, dataset=None
        )
        test_data = tf.constant([0])
        result = export_model.call(test_data).numpy()
        self.assertEqual(result[0][0], 3.0)

    def test_get_subclass_model_to_export(self):
        def _get_dataset():
            dataset = tf.data.Dataset.from_tensor_slices(
                np.random.randint(0, 10, (10, 4))
            )
            dataset = dataset.batch(2)
            return dataset

        model_inst = CustomModel()
        dataset = _get_dataset()
        trained_params = {
            "custom_model/embedding/embeddings:0": np.ones(
                (4, 2), dtype="float32"
            ),
            "custom_model/dense/kernel:0": np.ones((2, 1), dtype="float32"),
            "custom_model/dense/bias:0": np.ones((1), dtype="float32"),
        }
        for name, value in trained_params.items():
            self.master.set_model_var(name, value)
        train_model = self.model_handler.get_model_to_train(model_inst)
        self.assertEqual(type(train_model.embedding), Embedding)
        export_model = self.model_handler.get_model_to_export(
            train_model, dataset=dataset
        )
        test_data = tf.constant([0])
        result = export_model.call(test_data).numpy()
        self.assertEqual(result[0][0], 3.0)