def testInitFromCheckpoint(self):
    init_var = m["custom_model"]().trainable_variables
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testInitFromCheckpoint")
        os.makedirs(chkp_dir)
        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init="",
            checkpoint_service=CheckpointService(chkp_dir, 2, 3, False),
            evaluation_service=None,
        )
        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0
        model = master.GetModel(req, None)
        master._checkpoint_service.save(master._version, model, False)
        chkp_file = master._checkpoint_service.get_checkpoint_path(
            master._version
        )

        # Create variables from init_var, get init value from checkpoint.
        master2 = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init=chkp_file,
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        model2 = master2.GetModel(req, None)
        self.assertEqual(model, model2)

        # Create variables from checkpoint.
        master3 = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=[],
            checkpoint_filename_for_init=chkp_file,
            checkpoint_service=CheckpointService("", 0, 0, False),
            evaluation_service=None,
        )
        model3 = master3.GetModel(req, None)
        self.assertEqual(model, model3)
def testNeedToCheckpoint(self):
    checkpointer = CheckpointService("", 0, 5, False)
    self.assertFalse(checkpointer.is_enabled())
    checkpointer._steps = 3
    self.assertTrue(checkpointer.is_enabled())
    self.assertFalse(checkpointer.need_to_checkpoint(1))
    self.assertFalse(checkpointer.need_to_checkpoint(2))
    self.assertTrue(checkpointer.need_to_checkpoint(3))
    self.assertFalse(checkpointer.need_to_checkpoint(4))
    self.assertFalse(checkpointer.need_to_checkpoint(5))
    self.assertTrue(checkpointer.need_to_checkpoint(6))
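# A minimal usage sketch of the contract exercised above (illustrative,
# not part of the test suite; the directory and `pb_model` are assumed):
# a training loop saves every `checkpoint_steps` versions, and the
# service retains at most `keep_checkpoint_max` checkpoints.
#
#   checkpointer = CheckpointService("/tmp/ckpt", 2, 5, False)
#   for version in range(1, 11):
#       if checkpointer.need_to_checkpoint(version):
#           checkpointer.save(version, pb_model, False)
#   # Saves at versions 2, 4, 6, 8, and 10; all five fit within
#   # keep_checkpoint_max=5.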
def testUserDefinedModel(self):
    master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    req = elasticdl_pb2.GetModelRequest()
    req.method = elasticdl_pb2.MINIMUM
    req.version = 0

    model_inst = SimpleModel()
    model_inst.build(SimpleModel.input_shapes())
    for variable in model_inst.trainable_variables:
        master.set_model_var(variable.name, variable.numpy())
    # Get version 0
    model = master.GetModel(req, None)
    self.assertEqual(0, model.version)
    self.assertEqual(
        [
            "dense_1/bias:0",
            "dense_1/kernel:0",
            "dense_2/bias:0",
            "dense_2/kernel:0",
        ],
        list(sorted(model.param.keys())),
    )
def testEvaluationService(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testEvaluationService")
        checkpoint_service = CheckpointService(chkp_dir, 5, 5, True)
        task_d = _TaskDispatcher(
            {"f1": (0, 10), "f2": (0, 10)},
            {"f1": (0, 10), "f2": (0, 10)},
            {},
            3,
            1,
        )

        # Evaluation metrics will not be accepted if no evaluation is
        # ongoing.
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 10, 20, 0, False
        )
        evaluation_metrics = {
            "mse": ndarray_to_tensor(
                np.array([100, 200], dtype=np.float32)
            )
        }
        self.assertFalse(
            evaluation_service.report_evaluation_metrics(
                1, evaluation_metrics
            )
        )

        # No checkpoint available
        self.assertFalse(evaluation_service.try_to_create_new_job())

        master = MasterServicer(
            2,
            2,
            None,
            task_d,
            init_var=[],
            checkpoint_filename_for_init="",
            checkpoint_service=checkpoint_service,
            evaluation_service=evaluation_service,
        )
        master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))

        # Add an evaluation task so evaluation can start.
        self.assertEqual(8, len(task_d._todo))
        evaluation_service.add_evaluation_task(0)
        self.assertEqual(16, len(task_d._todo))
        self.assertFalse(evaluation_service._eval_job.finished())

        for i in range(8):
            self.assertFalse(evaluation_service._eval_job.finished())
            evaluation_service.complete_task()
        self.assertTrue(evaluation_service._eval_job is None)
        self.assertFalse(evaluation_service.try_to_create_new_job())
def testSaveLoadCheckpoint(self):
    init_var = m["custom_model"]().trainable_variables
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testSaveLoadCheckpoint")
        os.makedirs(chkp_dir)
        checkpointer = CheckpointService(chkp_dir, 3, 5, False)
        self.assertTrue(checkpointer.is_enabled())

        master = MasterServicer(
            2,
            3,
            None,
            None,
            init_var=init_var,
            checkpoint_filename_for_init="",
            checkpoint_service=checkpointer,
            evaluation_service=None,
        )
        req = elasticdl_pb2.GetModelRequest()
        req.method = elasticdl_pb2.MINIMUM
        req.version = 0
        model = master.GetModel(req, None)

        checkpointer.save(0, model, False)
        loaded_model = checkpointer.get_checkpoint_model(0)
        self.assertEqual(model.version, loaded_model.version)
        for var, loaded_var in zip(model.param, loaded_model.param):
            self.assertEqual(var, loaded_var)
def testReportTaskResult(self):
    task_d = _TaskDispatcher(
        {"shard_1": (0, 10), "shard_2": (0, 9)},
        {},
        {},
        records_per_task=3,
        num_epochs=2,
    )
    master = MasterServicer(
        3,
        3,
        None,
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )

    # Map from task key to number of runs.
    tasks = defaultdict(int)
    while True:
        req = elasticdl_pb2.GetTaskRequest()
        req.worker_id = random.randint(1, 10)
        task = master.GetTask(req, None)
        if not task.shard_name:
            break
        self.assertEqual(task_d._doing[task.task_id][0], req.worker_id)
        task_key = (task.shard_name, task.start, task.end)
        tasks[task_key] += 1
        report = elasticdl_pb2.ReportTaskResultRequest()
        report.task_id = task.task_id
        if task.start == 0 and tasks[task_key] == 1:
            # Simulate error reports.
            report.err_message = "Worker error"
        master.ReportTaskResult(report, None)

    self.assertDictEqual(
        {
            ("shard_1", 0, 3): 3,
            ("shard_1", 3, 6): 2,
            ("shard_1", 6, 9): 2,
            ("shard_1", 9, 10): 2,
            ("shard_2", 0, 3): 3,
            ("shard_2", 3, 6): 2,
            ("shard_2", 6, 9): 2,
        },
        tasks,
    )
def _create_checkpoint_service(self, args):
    checkpoint_service = None
    if (
        args.checkpoint_steps
        or self.job_type == JobType.TRAINING_WITH_EVALUATION
    ):
        self.logger.info("Creating checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            self.job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    return checkpoint_service
def setUp(self):
    tf.keras.backend.clear_session()
    self.master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    self.master._version = 1
    self.model_handler = ModelHandler.get_model_handler(
        distribution_strategy="ParameterServerStrategy", stub=self.master
    )
def test_save_parameters_to_checkpoint_file(self):
    with tempfile.TemporaryDirectory() as tempdir:
        checkpoint_service = CheckpointService(
            checkpoint_dir=os.path.join(tempdir, "ckpt/"),
            checkpoint_steps=5,
            keep_checkpoint_max=3,
            include_evaluation=False,
        )
        pserver_servicer = PserverServicer(
            parameters=Parameters(),
            grads_to_wait=0,
            optimizer="optimizer",
            checkpoint_service=checkpoint_service,
            ps_id=0,
            num_ps_pods=1,
        )
        model_params = {
            "v0": tf.Variable([[1, 1, 1], [1, 1, 1]]),
            "v1": tf.Variable([[2, 2, 2], [2, 2, 2]]),
        }
        server_params = pserver_servicer._parameters
        for var_name, var_value in model_params.items():
            server_params.non_embedding_params[var_name] = var_value
        embedding_table = EmbeddingTable(
            name="embedding_0", dim=3, initializer="random_uniform"
        )
        server_params.embedding_params["embedding_0"] = embedding_table
        server_params.set_embedding_param(
            name="embedding_0",
            indices=np.array([0, 1]),
            values=np.array([[1, 1, 1], [2, 2, 2]]),
        )

        for i in range(100):
            pserver_servicer._parameters.version += 1
            pserver_servicer._save_params_to_checkpoint_if_needed()

        self.assertEqual(len(os.listdir(checkpoint_service._directory)), 3)
        self.assertEqual(
            sorted(os.listdir(checkpoint_service._directory)),
            ["version-100", "version-90", "version-95"],
        )
        self.assertEqual(
            os.listdir(checkpoint_service._directory + "/version-100"),
            ["variables-0-of-1.chkpt"],
        )
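# Expected on-disk layout after the loop above, read off the assertions
# (the shard filename plausibly generalizes to
# "variables-<ps_id>-of-<num_ps_pods>.chkpt", which is an assumption):
#
#   ckpt/
#     version-90/variables-0-of-1.chkpt
#     version-95/variables-0-of-1.chkpt
#     version-100/variables-0-of-1.chkpt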
def _init_checkpoint_service(self, args):
    if all(
        [
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
        ]
    ):
        self.checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            include_evaluation=False,
        )
    else:
        self.checkpoint_service = None
        self.logger.warning(
            "Invalid checkpoint config and no model will be saved"
        )
def testGetEmptyTask(self):
    master = MasterServicer(
        2,
        3,
        None,
        _TaskDispatcher({}, {}, {}, records_per_task=3, num_epochs=2),
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    req = elasticdl_pb2.GetTaskRequest()

    # No task yet; make sure the returned versions are as expected.
    req.worker_id = 1
    task = master.GetTask(req, None)
    self.assertEqual("", task.shard_file_name)
    self.assertEqual(0, task.model_version)

    master._version = 1
    task = master.GetTask(req, None)
    self.assertEqual("", task.shard_file_name)
    self.assertEqual(1, task.model_version)
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation"""

    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        embedding_service_endpoint=None,
        embedding_dims={},
        lr_staleness_modulation=False,
        use_async=False,
    ):
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead of ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._embedding_service_endpoint = embedding_service_endpoint
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(
            optimizer, embedding_service_endpoint, embedding_dims, use_async
        )

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set a model variable. Value should be a float32 ndarray."""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name)
        )

    def _modulate_lr_if_needed(self, opt):
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_dict(self, tensor_dict):
        assert tensor_dict
        for name, val in tensor_dict.items():
            self.set_model_var(name, tensor_to_ndarray(val))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_dict(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            logger.info(
                "Model is not initialized. It will be "
                "initialized by the first update from "
                "the worker."
            )

    def _init_optimizer(
        self, opt, embedding_service_endpoint, embedding_dims, use_async
    ):
        # A non-None `embedding_service_endpoint` means ElasticDL embedding
        # layers are used.
        self._modulate_lr_if_needed(opt)
        if embedding_service_endpoint:
            return OptimizerWrapper(
                opt, embedding_service_endpoint, embedding_dims, use_async
            )
        return opt

    @staticmethod
    def var_name_encode(name):
        return name.replace(":", "-")

    def GetTask(self, request, _):
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        task_id, task = self._task_d.get(request.worker_id)
        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            # An evaluation task uses the fixed version model.
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif not self._task_d.finished():
            # Not all tasks are finished; wait in case new tasks arrive later.
            res.type = elasticdl_pb2.WAIT
        return res

    def GetModel(self, request, _):
        if not self._use_async:
            self._validate_model_version(request.version)

        if (
            request.method == elasticdl_pb2.MINIMUM
            or request.version == self._version
        ):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version
            )
        except Exception:
            logger.error(
                "Failed to fetch checkpoint model for "
                "model version {}".format(request.version)
            )
        return pb_model

    def _update_model_version(self):
        assert self._lock.locked()
        self._version += 1

    def _update_edl_embedding_table(self, name_var_list):
        """
        Put the updated embedding vectors' ids and values together and use
        EmbeddingService.update_embedding() to update the embedding table
        in the distributed storage.
        """
        keys = []
        embeddings = []
        for layer_name, unique_ids, embedding_var in name_var_list:
            keys.extend(
                [
                    Embedding.get_key([layer_name, i])
                    for i in unique_ids.numpy()
                ]
            )
            embeddings.extend([i for i in embedding_var.numpy()])

        if embeddings:
            EmbeddingService.update_embedding(
                keys=keys,
                embedding_vectors=embeddings,
                embedding_service_endpoint=self._embedding_service_endpoint,
            )

    def _update_model(self):
        grad_var = []

        # (grad, var) pairs excluding the Keras Embedding layer and the
        # ElasticDL Embedding layer
        for k in self._gradient_sum:
            if not self._use_async:
                self._gradient_sum[k] = (
                    self._gradient_sum[k] / self._grad_to_wait
                )
            grad_var.append((self._gradient_sum[k], self._model[k]))

        # (grad, var) pairs of the Keras Embedding layer
        for k in self._gradient_sum_indexed:
            grad_var.append((self._gradient_sum_indexed[k], self._model[k]))

        # (grad, var) pairs of the ElasticDL Embedding layer
        if self._edl_embedding_gradients:
            for layer_name, grads in self._edl_embedding_gradients.items():
                grad_var.append((grads, layer_name))

        self._opt.apply_gradients(grad_var)

        # Need the lock for the model version update in async SGD.
        if self._use_async:
            self._lock.acquire()
        self._update_model_version()
        self._update_evaluation()
        self._update_checkpoint()
        if self._use_async:
            self._lock.release()
        else:
            self._gradient_sum.clear()
            self._gradient_sum_indexed.clear()
            self._edl_embedding_gradients.clear()
            self._grad_n = 0

    def get_model_version(self):
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        try:
            logger.info(
                "Saving checkpoint for model version %d" % self._version
            )
            if locking:
                self._lock.acquire()
            pb_model = self._get_model_no_lock()
            self._checkpoint_service.save(
                self._version, pb_model, is_eval_checkpoint
            )
            checkpoint_version = self._version
            if locking:
                self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d"
                % self._version
            )

    def save_latest_checkpoint(self, output_path):
        if self._checkpoint_service is None:
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version()
        )
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False
            )

    def _update_checkpoint(self):
        if (
            self._checkpoint_service
            and self._checkpoint_service.need_to_checkpoint(self._version)
        ):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            pb_model.param[k].CopyFrom(ndarray_to_tensor(v.numpy()))
        return pb_model

    def _validate_model_version(self, request_model_version):
        if request_model_version > self._version:
            err_msg = (
                "Model version %d not available yet, "
                "current version: %d"
                % (request_model_version, self._version)
            )
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_dict(request.variable)
        return empty_pb2.Empty()

    def ReportGradient(self, request, _):
        model_version_valid = self._use_async or self._validate_model_version(
            request.model_version
        )

        res = elasticdl_pb2.ReportGradientResponse()
        if not model_version_valid:
            logger.warning(
                "Task result for outdated version %d dropped",
                request.model_version,
            )
            res.accepted = False
            res.model_version = self._version
            return res

        tmp = {}
        indexed_grads = {}
        edl_embedding_gradients = {}
        # Do a sanity check before accumulating gradients.
        for k, v in request.gradient.items():
            if k not in self._model:
                if v.indices:
                    # Gradients of an ElasticDL Embedding layer.
                    # TODO: check arr.shape[1] == embedding_dim of this
                    # EdlEmbedding layer
                    arr = tensor_to_ndarray(v)
                    edl_embedding_gradients[k] = arr
                    continue
                else:
                    raise ValueError(
                        "Gradient key: %s is not part of model", k
                    )

            arr = tensor_to_ndarray(v)
            if isinstance(arr, tf.IndexedSlices):
                if arr.values.shape[1] != self._model[k].numpy().shape[1]:
                    raise ValueError(
                        "Gradient key: %s has incompatible "
                        "indexed slice dimension %d, expected %d"
                        % (
                            k,
                            arr.values.shape[1],
                            self._model[k].numpy().shape[1],
                        )
                    )

                max_index = tf.math.reduce_max(arr.indices).numpy()
                if max_index >= self._model[k].numpy().shape[0]:
                    raise ValueError(
                        "Gradient key: %s has wrong indices %d, "
                        "out of range %d"
                        % (k, max_index, self._model[k].numpy().shape[0] - 1)
                    )
                indexed_grads[k] = arr
            else:
                if arr.shape != self._model[k].numpy().shape:
                    raise ValueError(
                        "Gradient key: %s has incompatible dimension", k
                    )
                tmp[k] = arr

        if not self._use_async:
            self._lock.acquire()
        self._process_gradients(
            edl_embedding_gradients, indexed_grads, tmp, request.model_version
        )
        if not self._use_async:
            self._lock.release()

        res.accepted = True
        res.model_version = self._version
        return res

    def _process_gradients(
        self, edl_embedding_gradients, indexed_grads, grads, request_version
    ):
        if not self._use_async:
            # Gradients of the ElasticDL Embedding layer
            for k, v in edl_embedding_gradients.items():
                if k in self._edl_embedding_gradients:
                    self._edl_embedding_gradients[k] = merge_indexed_slices(
                        self._edl_embedding_gradients[k], v
                    )
                else:
                    self._edl_embedding_gradients[k] = v

            # Gradients of the Keras Embedding layer
            for k, v in indexed_grads.items():
                if k not in self._gradient_sum_indexed:
                    self._gradient_sum_indexed[k] = v
                else:
                    grads_s = self._gradient_sum_indexed[k]
                    self._gradient_sum_indexed[k] = merge_indexed_slices(
                        grads_s, v
                    )

            # Other gradients
            for k, v in grads.items():
                if not self._use_async and k in self._gradient_sum:
                    self._gradient_sum[k] = self._gradient_sum[k] + v
                else:
                    self._gradient_sum[k] = v

            self._grad_n += 1
        else:
            # TODO: do not accumulate gradients but apply them directly.
            pass

        # Staleness-aware learning rate modulation
        if self._lr_modulation:
            staleness = max(1, self._version - request_version)
            self._lr_modulation.set_multiplier(1.0 / staleness)
        if self._use_async or self._grad_n >= self._grad_to_wait:
            self._update_model()

    def ReportTaskResult(self, request, _):
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request.task_id, False)
        else:
            self._task_d.report(request.task_id, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.evaluation_metrics
        )
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res
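# Hedged usage sketch (mirrors the unit tests elsewhere in this repo;
# the optimizer, learning rate, and variable values are illustrative):
# the servicer can be driven in-process without a gRPC channel.
#
#   master = MasterServicer(
#       2, 16, tf.optimizers.SGD(0.1), task_d,
#       init_var=[], checkpoint_filename_for_init="",
#       checkpoint_service=CheckpointService("", 0, 0, False),
#       evaluation_service=None,
#   )
#   master.set_model_var("w", np.zeros((4,), dtype=np.float32))
#   req = elasticdl_pb2.GetModelRequest()
#   req.method = elasticdl_pb2.MINIMUM
#   req.version = 0
#   pb_model = master.GetModel(req, None)  # protobuf with the current model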
def testGetModel(self):
    master = MasterServicer(
        2,
        3,
        None,
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master.set_model_var("x", np.array([1.0, 1.0], dtype=np.float32))
    # Now the master model is version 0
    self.assertEqual(0, master._version)

    # Get version 0 with the MINIMUM method
    req = elasticdl_pb2.GetModelRequest()
    req.version = 0
    req.method = elasticdl_pb2.MINIMUM
    model = master.GetModel(req, None)
    self.assertEqual(0, model.version)
    self.assertEqual(["x"], list(model.param.keys()))
    np.testing.assert_array_equal(
        np.array([1.0, 1.0]), tensor_to_ndarray(model.param["x"])
    )

    # Increase the master model version to 1, but still request
    # version 0 with the MINIMUM method; we should get version 1.
    master._version = 1
    master.set_model_var("x", np.array([2.0, 2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))
    model = master.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )

    # Try to get version 2; it should raise an exception.
    req.version = 2
    self.assertRaises(ValueError, master.GetModel, req, None)

    # Get fixed version 1
    req.method = elasticdl_pb2.FIXED
    req.version = 1
    model = master.GetModel(req, None)
    self.assertEqual(1, model.version)
    self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
    np.testing.assert_array_equal(
        np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
    )
    np.testing.assert_array_equal(
        np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
    )

    # The previous model is unavailable because there is no checkpoint.
    req.version = 0
    model = master.GetModel(req, None)
    self.assertFalse(model.param)

    # The previous model becomes available through a checkpoint.
    with tempfile.TemporaryDirectory() as tempdir:
        chk_dir = os.path.join(tempdir, "testGetModel")
        os.makedirs(chk_dir)
        req.version = master._version
        req.method = elasticdl_pb2.MINIMUM
        model = master.GetModel(req, None)
        master._checkpoint_service = CheckpointService(chk_dir, 2, 5, False)
        master._checkpoint_service.save(master._version, model, False)
        master._version = 2
        master.set_model_var("z", np.array([2.0, 2.0], dtype=np.float32))
        req.version = 1
        req.method = elasticdl_pb2.FIXED
        model = master.GetModel(req, None)
        self.assertEqual(1, model.version)
        self.assertEqual(["x", "y"], list(sorted(model.param.keys())))
        np.testing.assert_array_equal(
            np.array([2.0, 2.0]), tensor_to_ndarray(model.param["x"])
        )
        np.testing.assert_array_equal(
            np.array([12.0, 13.0]), tensor_to_ndarray(model.param["y"])
        )
def testReportGradient(self):
    def makeGrad():
        """Make a ReportGradientRequest compatible with the model."""
        req = elasticdl_pb2.ReportGradientRequest()
        req.gradient["x"].CopyFrom(
            ndarray_to_tensor(np.array([0.1], dtype=np.float32))
        )
        req.gradient["y"].CopyFrom(
            ndarray_to_tensor(np.array([0.03, 0.06], dtype=np.float32))
        )
        req.model_version = 1
        return req

    master = MasterServicer(
        3,
        3,
        tf.optimizers.SGD(0.1),
        None,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=CheckpointService("", 0, 0, False),
        evaluation_service=None,
    )
    master._version = 1
    master.set_model_var("x", np.array([2.0], dtype=np.float32))
    master.set_model_var("y", np.array([12.0, 13.0], dtype=np.float32))

    # Report a future version; it should raise an exception.
    req = makeGrad()
    req.model_version = 2
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an old version; it should not be accepted.
    req = makeGrad()
    req.model_version = 0
    res = master.ReportGradient(req, None)
    self.assertFalse(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report an unknown gradient; it should raise an exception.
    req = makeGrad()
    req.gradient["z"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report an incompatible gradient; it should raise an exception.
    req = makeGrad()
    req.gradient["y"].CopyFrom(
        ndarray_to_tensor(np.array([0.1], dtype=np.float32))
    )
    self.assertRaises(ValueError, master.ReportGradient, req, None)

    # Report the current version; it should be accepted.
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Report the current version with part of the gradients; it should
    # be accepted.
    req = makeGrad()
    del req.gradient["y"]
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(1, res.model_version)

    # Gradients should be accumulated.
    np.testing.assert_array_equal(
        np.array([0.2], dtype=np.float32), master._gradient_sum["x"]
    )
    np.testing.assert_array_equal(
        np.array([0.03, 0.06], dtype=np.float32), master._gradient_sum["y"]
    )
    self.assertEqual(2, master._grad_n)

    # Report the current version again; it should be accepted, and a new
    # version created once `grads_to_wait` (3) reports have arrived.
    req = makeGrad()
    res = master.ReportGradient(req, None)
    self.assertTrue(res.accepted)
    self.assertEqual(2, res.model_version)
    self.assertFalse(master._gradient_sum)
    self.assertEqual(0, master._grad_n)
    np.testing.assert_array_equal(
        # x: [2] - 0.1 * ([0.3] / 3) = [2] - 0.1 * [0.1]
        np.array([1.99], dtype=np.float32),
        master._model["x"].numpy(),
    )
    np.testing.assert_array_equal(
        # y: [12, 13] - 0.1 * ([0.06, 0.12] / 3) = [12, 13] - 0.1 * [0.02, 0.04]
        np.array([11.998, 12.996], dtype=np.float32),
        master._model["y"].numpy(),
    )
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.
    gRPC calls are mocked by local master calls.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that
            will be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model module.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for job
            type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callbacks that will be called at given
            stages of the training procedure.
        use_async: A Python bool. True if using asynchronous updates.
        get_model_steps: The worker will perform `get_model` from the
            parameter server every this many steps.

    Returns:
        An integer indicating the model version after the distributed
        training and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION
        if training
        else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id", "1",
        "--job_type", job_type,
        "--minibatch_size", batch_size,
        "--model_zoo", model_zoo_path,
        "--model_def", model_def,
        "--model_params", model_params,
        "--get_model_steps", get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more tasks should remain.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
def main():
    args = parse_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master address
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start the TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start the task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        args.records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize the checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize the evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            "and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model; if any are found,
    # initialize the embedding service.
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service started successfully. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m", "elasticdl.python.worker.main",
            "--model_zoo", args.model_zoo,
            "--master_addr", master_addr,
            "--log_level", args.log_level,
            "--dataset_fn", args.dataset_fn,
            "--loss", args.loss,
            "--optimizer", args.optimizer,
            "--eval_metrics_fn", args.eval_metrics_fn,
            "--model_def", args.model_def,
            "--job_type", job_type,
            "--minibatch_size", str(args.minibatch_size),
            "--embedding_service_endpoint", str(embedding_service_endpoint),
            "--get_model_steps", str(args.get_model_steps),
        ]

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start the TensorBoard k8s service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated."
                )
                break
    logger.info("Master stopped")
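# Hedged launch sketch: the master is normally started as a module with
# the flags parsed by parse_args() above. The module path and the exact
# flag set are assumptions based on the argument names used in main(),
# not a verified CLI:
#
#   python -m elasticdl.python.master.main \
#       --port 50001 \
#       --model_zoo /model_zoo \
#       --model_def cifar10_subclass.CustomModel \
#       --minibatch_size 64 \
#       --num_epochs 1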
class MasterServicer(elasticdl_pb2_grpc.MasterServicer):
    """Master service implementation"""

    def __init__(
        self,
        grads_to_wait,
        minibatch_size,
        optimizer,
        task_d,
        *,
        init_var,
        checkpoint_filename_for_init,
        checkpoint_service,
        evaluation_service,
        lr_staleness_modulation=False,
        use_async=False,
    ):
        # TODO: group params together into a single object.
        self._task_d = task_d
        self._lock = threading.Lock()
        self._gradient_sum = {}
        self._edl_embedding_gradients = {}
        self._gradient_sum_indexed = {}
        self._grad_to_wait = grads_to_wait
        self._grad_n = 0
        self._minibatch_size = minibatch_size
        self._use_async = use_async
        self._lr_staleness_modulation = lr_staleness_modulation

        # A <string, tf.ResourceVariable> map. We use tf.ResourceVariable
        # instead of ndarray to avoid copying and conversion when calling
        # optimizer's apply_gradients() function.
        self._model = {}
        self._version = 0
        self._init_model(checkpoint_filename_for_init, init_var)
        self._opt = self._init_optimizer(optimizer, use_async)

        self._checkpoint_service = checkpoint_service
        self._evaluation_service = evaluation_service
        if evaluation_service:
            evaluation_service.set_master_servicer(self)

    # TODO: Multiple tests are currently using the function `set_model_var` to
    # initialize self._model, where the initialization should be done via
    # servicer's constructor.
    def set_model_var(self, name, value):
        """Add or set a model variable. Value should be a float32 ndarray."""
        if value.dtype != np.float32:
            raise ValueError("Value should be a float32 numpy array")
        self._model[name] = tf.Variable(
            value, name=MasterServicer.var_name_encode(name)
        )

    def _modulate_lr_if_needed(self, opt):
        if self._use_async and self._lr_staleness_modulation:
            self._lr_modulation = add_lr_modulation_to_optimizer(opt)
        else:
            self._lr_modulation = None

    def _init_model_from_var_list(self, var_list):
        for var in var_list:
            self.set_model_var(var.name, var.numpy())

    def _init_model_from_tensor_pb_list(self, tensor_pb_list):
        assert tensor_pb_list
        for pb in tensor_pb_list:
            self.set_model_var(pb.name, tensor_pb_to_ndarray(pb))

    def _init_model(self, checkpoint_filename_for_init, init_var):
        if checkpoint_filename_for_init:
            pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
            self._version = pb_model.version
            self._init_model_from_tensor_pb_list(pb_model.param)
        elif init_var:
            self._init_model_from_var_list(init_var)
        else:
            logger.info(
                "Model is not initialized. It will be "
                "initialized by the first update from "
                "the worker."
            )

    # TODO: remove this function
    def _init_optimizer(self, opt, use_async):
        self._modulate_lr_if_needed(opt)
        if opt:
            return OptimizerWrapper(opt, use_async)
        return opt

    @staticmethod
    def var_name_encode(name):
        return name.replace(":", "-")

    def GetTask(self, request, _):
        res = elasticdl_pb2.Task()
        res.model_version = self._version
        res.minibatch_size = self._minibatch_size
        if request.task_type == elasticdl_pb2.EVALUATION:
            task_id, task = self._task_d.get_eval_task(request.worker_id)
        else:
            task_id, task = self._task_d.get(request.worker_id)

        if task:
            res.task_id = task_id
            res.shard_name = task.shard_name
            res.start = task.start
            res.end = task.end
            res.type = task.type
            for k, v in task.extended_config.items():
                res.extended_config[k] = v
            # An evaluation task uses the fixed version model.
            if task.type == elasticdl_pb2.EVALUATION:
                res.model_version = task.model_version
        elif (not self._task_d.finished()) or (
            self._task_d.invoke_deferred_callback()
        ):
            # If the todo and doing tasks are not all finished, or a
            # deferred callback was popped and invoked, the master tells
            # the worker to wait in case new tasks arrive later.
            res.type = elasticdl_pb2.WAIT
        return res

    def GetModel(self, request, _):
        if not self._use_async:
            self._validate_model_version(request.version)

        if (
            request.method == elasticdl_pb2.MINIMUM
            or request.version == self._version
        ):
            if self._use_async:
                res = self._get_model_no_lock()
            else:
                with self._lock:
                    res = self._get_model_no_lock()
            return res

        # Read from checkpoint for the fixed version model
        pb_model = elasticdl_pb2.Model()
        try:
            pb_model = self._checkpoint_service.get_checkpoint_model(
                request.version
            )
        except Exception:
            logger.error(
                "Failed to fetch checkpoint model for "
                "model version {}".format(request.version)
            )
        return pb_model

    def get_model_version(self):
        return self._version

    def _save_checkpoint(self, locking, is_eval_checkpoint):
        try:
            logger.info(
                "Saving checkpoint for model version %d" % self._version
            )
            if locking:
                self._lock.acquire()
            pb_model = self._get_model_no_lock()
            self._checkpoint_service.save(
                self._version, pb_model, is_eval_checkpoint
            )
            checkpoint_version = self._version
            if locking:
                self._lock.release()
            return checkpoint_version
        except Exception:
            logger.error(
                "Failed to save checkpoint file for model version %d"
                % self._version
            )

    def save_latest_checkpoint(self, output_path):
        if self._checkpoint_service is None:
            self._checkpoint_service = CheckpointService(
                checkpoint_dir="",
                checkpoint_steps=1,
                keep_checkpoint_max=1,
                include_evaluation=False,
            )
        self._save_checkpoint(locking=False, is_eval_checkpoint=False)
        checkpoint_path = self._checkpoint_service.get_checkpoint_path(
            self._checkpoint_service.get_latest_checkpoint_version()
        )
        copy_if_not_exists(checkpoint_path, output_path, is_dir=False)

    def _update_evaluation(self):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=self._version
            )

    def _update_checkpoint(self):
        if (
            self._checkpoint_service
            and self._checkpoint_service.need_to_checkpoint(self._version)
        ):
            self._save_checkpoint(locking=False, is_eval_checkpoint=False)

    def _get_model_no_lock(self):
        pb_model = elasticdl_pb2.Model()
        pb_model.version = self._version
        for k, v in self._model.items():
            emplace_tensor_pb_from_ndarray(pb_model.param, v.numpy(), name=k)
        return pb_model

    def _validate_model_version(self, request_model_version):
        if request_model_version > self._version:
            err_msg = (
                "Model version %d not available yet, "
                "current version: %d"
                % (request_model_version, self._version)
            )
            logger.warning(err_msg)
            raise ValueError(err_msg)
        return request_model_version == self._version

    def ReportVariable(self, request, _):
        with self._lock:
            if not self._model:
                self._init_model_from_tensor_pb_list(request.variable)
        return empty_pb2.Empty()

    def ReportTaskResult(self, request, _):
        if request.err_message:
            logger.warning("Worker reported error: " + request.err_message)
            self._task_d.report(request, False)
        else:
            self._task_d.report(request, True)
        return empty_pb2.Empty()

    def ReportEvaluationMetrics(self, request, _):
        report_metrics = self._evaluation_service.report_evaluation_metrics(
            request.model_version, request.model_outputs, request.labels
        )
        res = elasticdl_pb2.ReportEvaluationMetricsResponse()
        res.model_version = self._version
        res.accepted = report_metrics
        return res

    def ReportVersion(self, request, _):
        if self._evaluation_service:
            self._evaluation_service.add_evaluation_task_if_needed(
                master_locking=False, model_version=request.model_version
            )
        return empty_pb2.Empty()
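# Hedged usage sketch for the ReportVersion flow (the request message
# name is an assumption inferred from the RPC; values are illustrative):
# a worker periodically reports its local model version so the master
# can trigger an evaluation job when one is due.
#
#   report = elasticdl_pb2.ReportVersionRequest()  # assumed message name
#   report.model_version = local_version
#   master.ReportVersion(report, None)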
def testMaxCheckpointVersions(self):
    with tempfile.TemporaryDirectory() as tempdir:
        chkp_dir = os.path.join(tempdir, "testMaxCheckpointVersions")
        os.makedirs(chkp_dir)
        # Save checkpoints every 2 steps, and keep 5 checkpoints at most
        checkpointer = CheckpointService(chkp_dir, 2, 5, False)
        self.assertTrue(checkpointer.is_enabled())

        batch_size = 2
        # Launch the training
        arguments = [
            "--worker_id", 1,
            "--job_type", JobType.TRAINING_ONLY,
            "--minibatch_size", batch_size,
            "--model_zoo", _model_zoo_path,
            "--model_def", "test_module.custom_model",
        ]
        args = parse_worker_args(arguments)
        worker = Worker(args)

        filename = create_recordio_file(128, DatasetName.TEST_MODULE, 1)
        task_d = _TaskDispatcher(
            {filename: (0, 128)}, {}, {}, records_per_task=64, num_epochs=1
        )
        master = MasterServicer(
            2,
            batch_size,
            worker._opt_fn(),
            task_d,
            init_var=worker._model.trainable_variables,
            checkpoint_filename_for_init="",
            checkpoint_service=checkpointer,
            evaluation_service=None,
        )
        worker._stub = InProcessMaster(master)
        worker.run()

        # We should have 5 checkpoints when the training finishes
        checkpoint_files = sorted(os.listdir(checkpointer._directory))
        self.assertEqual(
            checkpoint_files,
            [
                "model_v24.chkpt",
                "model_v26.chkpt",
                "model_v28.chkpt",
                "model_v30.chkpt",
                "model_v32.chkpt",
            ],
        )
        # The latest version should be 32
        self.assertEqual(32, checkpointer.get_latest_checkpoint_version())
        # Check all checkpoints
        for version in [24, 26, 28, 30, 32]:
            model = checkpointer.get_checkpoint_model(version)
            self.assertEqual(version, model.version)
        # Checkpoint not found
        self.assertRaisesRegex(
            RuntimeError,
            "Failed to read model checkpoint from file",
            checkpointer.get_checkpoint_model,
            100,
        )
def distributed_train_and_evaluate(
    self,
    feature_shape,
    model_def,
    model_params="",
    training=True,
    dataset="",
):
    """Runs distributed training and evaluation with a local master.
    gRPC calls are mocked by local master calls.
    """
    job_type = (
        JobType.TRAINING_ONLY if training else JobType.EVALUATION_ONLY
    )
    batch_size = 16
    worker = Worker(
        1,
        job_type,
        batch_size,
        _model_zoo_path,
        model_def=model_def,
        model_params=model_params,
        channel=None,
    )

    if dataset == "imagenet":
        batch_size = 8
        shards = {create_imagenet_recordio_file(8, feature_shape): (0, 8)}
    elif dataset == "frappe":
        shards = {
            create_frappe_recordio_file(16, feature_shape, 5383): (0, 16)
        }
    else:
        shards = {create_recordio_file(128, feature_shape): (0, 128)}

    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    # Initialize the checkpoint service
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 1, False
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service, None, task_d, 0, 0, 0, True
        )
    task_d.set_evaluation_service(evaluation_service)

    # The master service
    master = MasterServicer(
        2,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
    )
    worker._stub = InProcessMaster(master)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more tasks should remain.
    self.assertTrue(not task.shard_name)