def __init__(self, args): self.logger = get_logger("PS", level=args.log_level.upper()) self.grads_to_wait = args.grads_to_wait self.lr_staleness_modulation = args.lr_staleness_modulation self.sync_version_tolerance = args.sync_version_tolerance self.use_async = args.use_async self.port = args.port model_module = load_module( get_module_file_path(args.model_zoo, args.model_def)).__dict__ self.optimizer = model_module[args.optimizer]() self._set_lr_scheduler(model_module, args.learning_rate_scheduler) self.ps_id = args.ps_id self.num_ps_pods = args.num_ps_pods self.num_workers = args.num_workers # Create Parameters instance self.parameters = Parameters() if args.master_addr is None: raise ValueError("master_addr is missing for parameter servers") self.master_channel = build_channel(args.master_addr) self.evaluation_steps = args.evaluation_steps self.master_name = get_master_pod_name(args.job_name) self.namespace = args.namespace self._init_checkpoint_saver(args) self._restore_params_from_checkpoint(args.checkpoint_dir_for_init) self._debug_info_needed = args.log_level.upper() == "DEBUG"
def __init__(self, args, task_manager, rendezvous_server=None):
    self.logger = get_logger("master", level=args.log_level.upper())

    self.num_ps_pods = args.num_ps_pods
    self.checkpoint_output_path = args.checkpoint_dir

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    self.master_addr = "%s:%d" % (master_ip, args.port)
    self.job_type = get_job_type(args)

    # Initialize the components from the model definition
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    self._optimizer = model_module[args.optimizer]()

    # TODO: Remove task manager and rendezvous server after
    # refactoring pod manager.
    self.task_manager = task_manager
    self.rendezvous_server = rendezvous_server

    self.evaluation_service = (
        None
        if args.eval_metrics_fn not in model_module
        else self._create_evaluation_service(
            model_module[args.eval_metrics_fn], args.evaluation_steps
        )
    )
def test_odps_data_reader_integration_with_local_keras(self):
    num_records = 2
    model_spec = load_module(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../../../model_zoo",
            "odps_iris_dnn_model/odps_iris_dnn_model.py",
        )
    ).__dict__
    model = model_spec["custom_model"]()
    optimizer = model_spec["optimizer"]()
    loss = model_spec["loss"]
    dataset_fn = model_spec["dataset_fn"]

    def _gen():
        for data in self.reader.read_records(
            _MockedTask(0, num_records, self.test_table + ":shard_0")
        ):
            if data is not None:
                yield data

    dataset = tf.data.Dataset.from_generator(_gen, tf.float32)
    dataset = dataset_fn(dataset, None)

    loss_history = []
    grads = None
    for features, labels in dataset:
        with tf.GradientTape() as tape:
            logits = model(features, training=True)
            loss_value = loss(logits, labels)
        loss_history.append(loss_value.numpy())
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    self.assertEqual(len(loss_history), num_records)
    self.assertEqual(len(grads), num_records)
    self.assertEqual(len(model.trainable_variables), num_records)
def _test_correctness(self, optimizer_class, X, Y, seed, **kwargs):
    """Test the correctness of specific TensorFlow optimizer."""
    _model_file = get_module_file_path(
        os.path.dirname(os.path.realpath(__file__)),
        "embedding_test_module.KerasEmbeddingModel",
    )
    model_module = load_module(_model_file).__dict__

    # train model with TensorFlow optimizer
    weights = self._random_init_model_weight(
        [(4, 4), (4, 4), (72, 1), (1,)], seed
    )
    loss_fn = model_module["loss"]
    model1 = model_module["KerasEmbeddingModel"](4, 4, weights)
    opt1 = optimizer_class(**kwargs)
    _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

    model2 = model_module["EdlEmbeddingModel"](4, weights[2:])
    opt2 = optimizer_class(**kwargs)

    layer_names = [layer.name for layer in find_layer(model2, Embedding)]
    embed_dims = dict([(layer_name, 4) for layer_name in layer_names])

    # initialize embedding vectors in kv store
    mock_kv_store = MockKvStore({})
    for layer, embed_table in zip(layer_names, weights[:2]):
        for i, embed_vector in enumerate(embed_table):
            mock_kv_store.update(["%s-%d" % (layer, i)], [embed_vector])

    # train model with optimizer wrapper
    with mock.patch.object(
        EmbeddingService, "lookup_embedding", mock_kv_store.lookup
    ), mock.patch.object(
        EmbeddingService, "update_embedding", mock_kv_store.update
    ):
        _train_edl_embedding_with_optimizer_wrapper(
            model2, opt2, X, Y, loss_fn, embed_dims, random_seed=seed
        )

    # compare trained parameters
    wrong_msg = (
        "The updated parameters of Optimizer Wrapper and TensorFlow "
        "optimizer %s differ." % opt1.get_config()["name"]
    )

    for layer1, layer2 in zip(model1.layers, model2.layers):
        if "embedding" in layer2.name:
            w1 = layer1.weights[0].numpy()
            keys = [Embedding.get_key([layer2.name, i]) for i in range(4)]
            w2 = np.concatenate(mock_kv_store.lookup(keys)[0]).reshape(4, -1)
            self.assertTrue(np.isclose(w1, w2).all(), msg=wrong_msg)
        else:
            for w1, w2 in zip(layer1.weights, layer2.weights):
                self.assertTrue(
                    np.isclose(w1.numpy(), w2.numpy()).all(), msg=wrong_msg
                )
def _test_correctness(self, optimizer_class, X, Y, seed, **opt_kwargs):
    """Test the correctness of specific TensorFlow optimizer."""
    _model_file = get_module_file_path(
        os.path.dirname(os.path.realpath(__file__)),
        "embedding_test_module.KerasEmbeddingModel",
    )
    model_module = load_module(_model_file).__dict__

    # train model with TensorFlow optimizer
    dim = 4
    weights = self._random_init_model_weight(
        [(4, dim), (4, dim), (72, 1), (1,)], seed
    )
    loss_fn = model_module["loss"]
    model1 = model_module["KerasEmbeddingModel"](4, dim, weights)
    opt1 = optimizer_class(**opt_kwargs)
    _train(model1, opt1, X, Y, loss_fn, random_seed=seed)

    model2 = model_module["EdlEmbeddingModel"](dim, weights[2:])
    opt2 = optimizer_class(**opt_kwargs)

    embedding_weight_names = [
        layer.embedding_weight_name for layer in find_layer(model2, Embedding)
    ]

    # create Parameters object and initialize embedding vectors
    params = Parameters()
    for weight_name, embed_value in zip(embedding_weight_names, weights[:2]):
        embed_table = EmbeddingTable(weight_name, dim)
        embed_table.set(range(len(embed_value)), embed_value)
        params.embedding_params[weight_name] = embed_table

    _train_edl_embedding_with_optimizer_wrapper(
        model2, opt2, X, Y, loss_fn, params, random_seed=seed
    )

    # compare trained parameters
    wrong_msg = (
        "The updated parameters of Optimizer Wrapper and TensorFlow "
        "optimizer %s differ." % opt1.get_config()["name"]
    )

    for layer1, layer2 in zip(model1.layers, model2.layers):
        if "embedding" in layer2.name:
            w1 = layer1.weights[0].numpy()
            w2 = params.get_embedding_param(
                layer2.embedding_weight_name, range(4)
            )
            self.assertTrue(np.isclose(w1, w2).all(), msg=wrong_msg)
        else:
            for w1, w2 in zip(layer1.weights, layer2.weights):
                self.assertTrue(
                    np.isclose(w1.numpy(), w2.numpy()).all(), msg=wrong_msg
                )
def _load_data_reader_fn(self, args):
    self._create_data_reader_fn = create_data_reader
    if args.model_zoo:
        # Initialize the components from the model definition
        model_module = load_module(
            get_module_file_path(args.model_zoo, args.model_def)
        ).__dict__
        if args.custom_data_reader in model_module:
            self._create_data_reader_fn = model_module[
                args.custom_data_reader
            ]
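A minimal sketch of what a model-zoo module could export under the name configured by `--custom_data_reader`, so the lookup above picks it up instead of the default `create_data_reader`. The `InMemoryReader` class and its synthetic record source are hypothetical; only the `read_records(task)` entry point mirrors the readers used by the tests in this section.

# Hypothetical custom data reader a model-zoo file might export; the factory
# name must match args.custom_data_reader for the override above to apply.
class InMemoryReader:
    def __init__(self, records):
        self._records = records

    def read_records(self, task):
        # task.start and task.end are assumed to follow the shard convention
        # used by the mocked tasks elsewhere in this section.
        for record in self._records[task.start:task.end]:
            yield record


def custom_data_reader(data_origin, records_per_task=None, **kwargs):
    # Ignore data_origin and serve synthetic records; a real reader would
    # open the data source identified by data_origin.
    return InMemoryReader(records=list(range(records_per_task or 0)))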
def main():
    parser = argparse.ArgumentParser(
        description="Spark job to convert training data to RecordIO format"
    )
    parser.add_argument(
        "--training_data_tar_file",
        help="Tar file that contains all training data",
        required=True,
    )
    parser.add_argument(
        "--output_dir",
        help="Directory of output RecordIO data",
        required=True,
    )
    parser.add_argument(
        "--model_file",
        required=True,
        help="User-defined model file in which the data processing logic lives",
    )
    parser.add_argument(
        "--records_per_file",
        default=1024,
        type=int,
        help="Records per file",
    )
    parser.add_argument(
        "--num_workers",
        default=2,
        type=int,
        help="Number of workers of Spark job",
    )
    args = parser.parse_args()

    # Get training data file names from training_data_tar_file
    tar = tarfile.open(args.training_data_tar_file)
    tar_info_list = tar.getmembers()
    filename_list = []
    for tar_info in tar_info_list:
        f = tar.extractfile(tar_info)
        if f is not None and not tar_info.name.split("/")[-1].startswith("."):
            filename_list.append(tar_info.name)

    # Load user-defined model
    model_module = load_module(args.model_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Start the Spark job
    sc = SparkContext()
    rdd = sc.parallelize(filename_list, args.num_workers)
    rdd.mapPartitions(
        process_data(
            model_module.prepare_data_for_a_single_file,
            args.training_data_tar_file,
            args.output_dir,
            args.records_per_file,
        )
    ).collect()
def __init__(
    self,
    *,
    image_name,
    namespace,
    job_name,
    event_callback=None,
    cluster_spec="",
    force_use_kube_config_file=False,
):
    """
    ElasticDL k8s client.

    Args:
        image_name: Docker image path for ElasticDL pod.
        namespace: The name of the Kubernetes namespace where ElasticDL
            pods will be created.
        job_name: ElasticDL job name, should be unique in the namespace.
            Used as pod name prefix and value for "elasticdl" label.
        event_callback: If not None, an event watcher will be created and
            events passed to the callback.
        force_use_kube_config_file: If true, force loading the cluster
            config from ~/.kube/config. Otherwise, load the in-cluster
            config when running inside a K8S environment, and the kube
            config file when not.
    """
    try:
        if (
            os.getenv("KUBERNETES_SERVICE_HOST")
            and not force_use_kube_config_file
        ):
            # We are running inside a k8s cluster
            config.load_incluster_config()
            logger.info("Load the incluster config.")
        else:
            # Use user's kube config
            config.load_kube_config()
            logger.info("Load the kube config file.")
    except Exception as ex:
        traceback.print_exc()
        raise Exception(
            "Failed to load configuration for Kubernetes:\n%s" % str(ex)
        )

    self.client = client.CoreV1Api()
    self.namespace = namespace
    self.job_name = job_name
    self._image_name = image_name
    self._event_cb = event_callback
    if self._event_cb:
        threading.Thread(
            target=self._watch, name="event_watcher", daemon=True
        ).start()
    self.cluster = None
    if cluster_spec:
        cluster_spec_module = load_module(cluster_spec)
        self.cluster = cluster_spec_module.cluster
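A hedged usage sketch for the constructor above, assuming it belongs to the k8s `Client` class in `elasticdl.python.common.k8s_client`; the image, namespace, and job names are placeholders, and the callback only prints the watch event type.

from elasticdl.python.common import k8s_client as k8s

# Placeholder values; outside a cluster this path falls back to ~/.kube/config.
client = k8s.Client(
    image_name="elasticdl:dev",
    namespace="default",
    job_name="test-job",
    event_callback=lambda event: print(event["type"]),
)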
def __init__(self, args):
    self.logger = get_logger("PS", level=args.log_level.upper())

    self.grads_to_wait = args.grads_to_wait
    self.lr_staleness_modulation = args.lr_staleness_modulation
    self.use_async = args.use_async
    self.port = args.port
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    self.optimizer = model_module[args.optimizer]()

    # Create Parameters instance
    self.parameters = Parameters()
def test_odps_data_reader_integration_with_local_keras(self):
    num_records = 2
    model_spec = load_module(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "../../../model_zoo",
            "odps_iris_dnn_model/odps_iris_dnn_model.py",
        )
    ).__dict__
    model = model_spec["custom_model"]()
    optimizer = model_spec["optimizer"]()
    loss = model_spec["loss"]
    reader = create_data_reader(
        data_origin=self.test_table,
        records_per_task=10,
        **{"columns": IRIS_TABLE_COLUMN_NAMES, "label_col": "class"}
    )
    dataset_fn = reader.default_dataset_fn()

    def _gen():
        for data in self.reader.read_records(
            _MockedTask(
                0,
                num_records,
                self.test_table + ":shard_0",
                elasticdl_pb2.TRAINING,
            )
        ):
            if data is not None:
                yield data

    dataset = tf.data.Dataset.from_generator(_gen, tf.string)
    dataset = dataset_fn(
        dataset, None, Metadata(column_names=IRIS_TABLE_COLUMN_NAMES)
    )
    dataset = dataset.batch(1)

    loss_history = []
    grads = None
    for features, labels in dataset:
        with tf.GradientTape() as tape:
            logits = model(features, training=True)
            loss_value = loss(labels, logits)
        loss_history.append(loss_value.numpy())
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    self.assertEqual(len(loss_history), num_records)
    self.assertEqual(len(grads), num_records)
    self.assertEqual(len(model.trainable_variables), num_records)
def __init__(
    self,
    *,
    image_name,
    namespace,
    job_name,
    event_callback=None,
    cluster_spec="",
):
    """
    ElasticDL k8s client.

    Args:
        image_name: Docker image path for ElasticDL pod.
        namespace: The name of the Kubernetes namespace where ElasticDL
            pods will be created.
        job_name: ElasticDL job name, should be unique in the namespace.
            Used as pod name prefix and value for "elasticdl" label.
        event_callback: If not None, an event watcher will be created and
            events passed to the callback.
    """
    if os.getenv("KUBERNETES_SERVICE_HOST"):
        # We are running inside k8s
        config.load_incluster_config()
    else:
        # Use user's kube config
        config.load_kube_config()

    self.client = client.CoreV1Api()
    self.namespace = namespace
    self.job_name = job_name
    self._image_name = image_name
    self._event_cb = event_callback
    if self._event_cb:
        threading.Thread(
            target=self._watch, name="event_watcher", daemon=True
        ).start()
    self.cluster = None
    if cluster_spec:
        cluster_spec_module = load_module(cluster_spec)
        self.cluster = cluster_spec_module.cluster
def __init__(self, args):
    self.logger = get_logger("PS", level=args.log_level.upper())

    self.grads_to_wait = args.grads_to_wait
    self.lr_staleness_modulation = args.lr_staleness_modulation
    self.use_async = args.use_async
    self.port = args.port
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    self.optimizer = model_module[args.optimizer]()
    self.ps_id = args.ps_id
    self.num_ps_pods = args.num_ps_pods
    # Create Parameters instance
    self.parameters = Parameters()
    if args.master_addr is None:
        raise ValueError("master_addr is missing for parameter servers")
    self.master_channel = build_channel(args.master_addr)
    self.evaluation_steps = args.evaluation_steps
    self.master_name = get_master_pod_name(args.job_name)
    self.namespace = args.namespace
    self._init_checkpoint_service(args)
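For context, a minimal sketch of the channel helper the constructor above depends on, assuming `build_channel` wraps `grpc.insecure_channel`; the real helper may also set message-size options, so treat this as an illustration rather than the library implementation.

import grpc


def build_channel_sketch(addr):
    # Hypothetical stand-in for build_channel: open an insecure gRPC channel
    # to an address such as "elasticdl-master:50001".
    return grpc.insecure_channel(addr)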
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A List of callbacks that will be called at given
            stages of the training procedure.
        use_async: A python bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION if training else JobType.EVALUATION_ONLY
    )
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--get_model_steps",
        get_model_steps,
    ]
    args = parse_worker_args(arguments)
    worker = Worker(args)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__
    checkpoint_service = CheckpointService("", 0, 0, True)
    if training:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            1,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            checkpoint_service,
            None,
            task_d,
            0,
            0,
            0,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)
    grads_to_wait = 1 if use_async else 2
    master = MasterServicer(
        grads_to_wait,
        batch_size,
        worker._opt_fn(),
        task_d,
        init_var=[],
        checkpoint_filename_for_init="",
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        use_async=use_async,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    worker._stub = InProcessMaster(master, callbacks)

    for var in worker._model.trainable_variables:
        master.set_model_var(var.name, var.numpy())

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.GetTask(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
def test_push_model(self):
    opt_func_name = "ftrl_optimizer"
    opt = load_module(_module_file).__dict__[opt_func_name]()
    opt_config = opt.get_config()
    slot_names = ["accumulator", "linear"]
    slot_init_value = {
        "accumulator": opt_config["initial_accumulator_value"],
        "linear": 0.0,
    }

    self.create_default_server_and_stub(optimizer=opt_func_name)
    param0 = {
        "v0": np.random.rand(3, 2).astype(np.float32),
        "v1": np.random.rand(10, 32).astype(np.float32),
    }
    param1 = {
        "v0": np.ones([3, 2], dtype=np.float32),
        "v1": np.ones([10, 32], dtype=np.float32),
    }

    models = [param0, param1]

    for idx, model in enumerate(models):
        req = elasticdl_pb2.Model()
        req.version = idx + 1
        for name in model:
            serialize_ndarray(model[name], req.dense_parameters[name])
        req.embedding_table_infos.append(self._embedding_info)
        res = self._stub.push_model(req)
        self.assertEqual(res, empty_pb2.Empty())

    # self._parameters is initialized with the first push_model call
    # and the second push_model has no effect
    self.assertEqual(self._parameters.version, 1)
    for name in param0:
        self.assertTrue(
            np.allclose(
                param0[name],
                self._parameters.non_embedding_params[name].numpy(),
            )
        )

    self.assertEqual(
        self._embedding_info.name,
        self._parameters.embedding_params[self._embedding_info.name].name,
    )
    self.assertEqual(
        self._embedding_info.dim,
        self._parameters.embedding_params[self._embedding_info.name].dim,
    )
    self.assertEqual(
        tf.keras.initializers.get(self._embedding_info.initializer).__class__,
        self._parameters.embedding_params[
            self._embedding_info.name
        ].initializer.__class__,
    )

    for slot_name in slot_names:
        name = get_slot_table_name(self._embedding_info.name, slot_name)
        table = self._parameters.embedding_params[name]
        self.assertTrue(name, table.name)
        self.assertTrue(self._embedding_info.dim, table.dim)
        embedding = table.get([2])
        self.assertTrue(
            (embedding - slot_init_value[slot_name] < 0.0001).all()
        )
def main():
    args = parse_master_args()
    logger = get_logger("master", level=args.log_level.upper())

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    master_addr = "%s:%d" % (master_ip, args.port)

    # Start TensorBoard service if requested
    if args.tensorboard_log_dir:
        logger.info(
            "Starting TensorBoard service with log directory %s",
            args.tensorboard_log_dir,
        )
        # Start TensorBoard CLI
        tb_service = TensorboardService(args.tensorboard_log_dir, master_ip)
        tb_service.start()
    else:
        tb_service = None

    # Start task queue
    logger.debug(
        "Starting task queue with training data directory %s, "
        "evaluation data directory %s, "
        "and prediction data directory %s",
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
    )
    records_per_task = args.minibatch_size * args.num_minibatches_per_task
    task_d = _make_task_dispatcher(
        args.training_data_dir,
        args.evaluation_data_dir,
        args.prediction_data_dir,
        records_per_task,
        args.num_epochs,
    )
    model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    model_inst = load_model_from_module(
        args.model_def, model_module, args.model_params
    )
    optimizer = model_module[args.optimizer]()

    if all(
        (
            args.training_data_dir,
            args.evaluation_data_dir,
            args.evaluation_throttle_secs or args.evaluation_steps,
        )
    ):
        job_type = JobType.TRAINING_WITH_EVALUATION
    elif all(
        (
            args.evaluation_data_dir,
            not args.training_data_dir,
            not args.prediction_data_dir,
        )
    ):
        job_type = JobType.EVALUATION_ONLY
    elif all(
        (
            args.prediction_data_dir,
            not args.evaluation_data_dir,
            not args.training_data_dir,
        )
    ):
        job_type = JobType.PREDICTION_ONLY
    else:
        job_type = JobType.TRAINING_ONLY

    # Initialize checkpoint service
    if args.checkpoint_steps or job_type == JobType.TRAINING_WITH_EVALUATION:
        logger.info("Starting checkpoint service")
        checkpoint_service = CheckpointService(
            args.checkpoint_dir,
            args.checkpoint_steps,
            args.keep_checkpoint_max,
            job_type == JobType.TRAINING_WITH_EVALUATION,
        )
    else:
        checkpoint_service = None

    # Initialize evaluation service
    evaluation_service = None
    if (
        job_type == JobType.TRAINING_WITH_EVALUATION
        or job_type == JobType.EVALUATION_ONLY
    ):
        logger.info(
            "Starting evaluation service with throttle seconds %d "
            "and evaluation steps %d",
            args.evaluation_throttle_secs,
            args.evaluation_steps,
        )
        evaluation_service = EvaluationService(
            checkpoint_service,
            tb_service,
            task_d,
            args.evaluation_start_delay_secs,
            args.evaluation_throttle_secs,
            args.evaluation_steps,
            job_type == JobType.EVALUATION_ONLY,
        )
        evaluation_service.start()
        task_d.set_evaluation_service(evaluation_service)

    embedding_service_endpoint = None
    embedding_dims = {}
    # Search for embedding layers in the model,
    # if found, initialize embedding service
    layers = find_layer(model_inst, Embedding)
    if layers:
        embedding_service = EmbeddingService()
        embedding_service_endpoint = embedding_service.start_embedding_service(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
            resource_request=args.master_resource_request,
            resource_limit=args.master_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
        )
        logger.info(
            "Embedding service start succeeded. The endpoint is %s."
            % str(embedding_service_endpoint)
        )
        embedding_dims = dict(
            [(layer.name, layer.output_dim) for layer in layers]
        )

    # The master service
    logger.info("Starting master service")
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=64),
        options=[
            ("grpc.max_send_message_length", GRPC.MAX_SEND_MESSAGE_LENGTH),
            (
                "grpc.max_receive_message_length",
                GRPC.MAX_RECEIVE_MESSAGE_LENGTH,
            ),
        ],
    )
    master_servicer = MasterServicer(
        args.grads_to_wait,
        args.minibatch_size,
        optimizer,
        task_d,
        init_var=model_inst.trainable_variables if model_inst.built else [],
        embedding_dims=embedding_dims,
        checkpoint_filename_for_init=args.checkpoint_filename_for_init,
        checkpoint_service=checkpoint_service,
        evaluation_service=evaluation_service,
        embedding_service_endpoint=embedding_service_endpoint,
        lr_staleness_modulation=args.lr_staleness_modulation,
        use_async=args.use_async,
    )
    elasticdl_pb2_grpc.add_MasterServicer_to_server(master_servicer, server)
    server.add_insecure_port("[::]:{}".format(args.port))
    server.start()
    logger.info("Server started at port: %d", args.port)

    worker_manager = None
    if args.num_workers:
        assert args.worker_image, "Worker image cannot be empty"

        worker_command = ["python"]
        worker_args = [
            "-m",
            "elasticdl.python.worker.main",
            "--master_addr",
            master_addr,
            "--job_type",
            job_type,
            "--embedding_service_endpoint",
            str(embedding_service_endpoint),
        ]
        worker_args.extend(build_arguments_from_parsed_result(args))

        env_dict = parse_envs(args.envs)
        env = []
        for key in env_dict:
            env.append(V1EnvVar(name=key, value=env_dict[key]))

        worker_manager = WorkerManager(
            task_d,
            job_name=args.job_name,
            image_name=args.worker_image,
            command=worker_command,
            args=worker_args,
            namespace=args.namespace,
            num_workers=args.num_workers,
            worker_resource_request=args.worker_resource_request,
            worker_resource_limit=args.worker_resource_limit,
            pod_priority=args.worker_pod_priority,
            volume=args.volume,
            image_pull_policy=args.image_pull_policy,
            restart_policy=args.restart_policy,
            cluster_spec=args.cluster_spec,
            envs=env,
        )
        worker_manager.update_status(WorkerManagerStatus.PENDING)
        logger.info("Launching %d workers", args.num_workers)
        worker_manager.start_workers()
        worker_manager.update_status(WorkerManagerStatus.RUNNING)

    # Start TensorBoard k8s Service if requested
    if tb_service:
        TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        ).start_tensorboard_service()

    try:
        while True:
            if task_d.finished():
                if worker_manager:
                    worker_manager.update_status(WorkerManagerStatus.FINISHED)
                if args.output:
                    master_servicer.save_latest_checkpoint(args.output)
                break
            time.sleep(30)
    except KeyboardInterrupt:
        logger.warning("Server stopping")

    if evaluation_service:
        logger.info("Stopping evaluation service")
        evaluation_service.stop()

    logger.info("Stopping RPC server")
    server.stop(0)

    # Keep TensorBoard running when all the tasks are finished
    if tb_service:
        logger.info(
            "All tasks finished. Keeping TensorBoard service running..."
        )
        while True:
            if tb_service.is_active():
                time.sleep(10)
            else:
                logger.warning(
                    "Unable to keep TensorBoard running. "
                    "It has already terminated"
                )
                break
    logger.info("Master stopped")
def __init__(self, args):
    self.logger = get_logger("master", level=args.log_level.upper())

    self.num_ps_pods = args.num_ps_pods
    self.checkpoint_output_path = args.checkpoint_dir

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    self.master_addr = "%s:%d" % (master_ip, args.port)
    self.job_type = Master._get_job_type(args)

    # Initialize TensorBoard service if requested
    self.tb_service = self._create_tensorboard_service(
        args.tensorboard_log_dir, master_ip
    )
    if self.tb_service:
        self.tb_client = TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        )

    # Initialize the components from the model definition
    self.model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    self.model_inst = load_model_from_module(
        args.model_def, self.model_module, args.model_params
    )
    model_handler = ModelHandler.get_model_handler(
        args.distribution_strategy, checkpoint_dir=args.checkpoint_dir
    )
    self.model_inst = model_handler.get_model_to_train(self.model_inst)
    self.optimizer = self.model_module[args.optimizer]()
    self._create_data_reader_fn = create_data_reader
    if args.custom_data_reader in self.model_module:
        self._create_data_reader_fn = self.model_module[
            args.custom_data_reader
        ]

    # Start task queue
    records_per_task = args.minibatch_size * args.num_minibatches_per_task
    self.task_d = _make_task_dispatcher(
        args.training_data,
        args.validation_data,
        args.prediction_data,
        records_per_task,
        args.num_epochs,
        args.data_reader_params,
        self._create_data_reader_fn,
    )

    saved_model_path = args.output
    if saved_model_path is not None and self.job_type in [
        JobType.TRAINING_ONLY,
        JobType.TRAINING_WITH_EVALUATION,
    ]:
        self.task_d.add_deferred_callback_create_save_model_task(
            saved_model_path
        )

    self.evaluation_service = self._create_evaluation_service(args)

    # Initialize master service
    self.master_servicer, self.server = self._create_master_service(args)

    # Initialize instance manager
    self.instance_manager = self._create_instance_manager(args)

    self._should_stop = False
    self._exit_code = 0
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION if training else JobType.EVALUATION_ONLY
    )
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = Mock(
        task_d=task_d,
        instance_manager=None,
        distribution_strategy=None,
    )

    def master_creator():
        return MasterServicer(
            batch_size,
            evaluation_service=evaluation_service,
            master=master,
        )

    svc, port = _server(master_creator)
    mc = MasterClient(build_channel("localhost:%d" % port), 1)
    worker = Worker(args, master_client=mc, ps_client=PSClient(ps_channels))

    for pservicer in pservers:
        # FIXME(yancey1989): decouple pserver and master client
        pservicer._master_stub = mc

    worker.run()

    task = mc.get_task()
    # stop the master servicer
    svc.stop(0)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return task.model_version
def _create_model_instance(model_def):
    module_file = get_module_file_path(_get_model_zoo_path(), model_def)
    model_module = load_module(module_file).__dict__
    return load_model_from_module(model_def, model_module, None)
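For context, a minimal sketch of the dynamic-import pattern these helpers rely on, assuming `load_module` behaves like a standard `importlib` file loader; the real helper lives in `elasticdl.python.common.model_utils`, and `load_module_sketch` is a hypothetical stand-in, not the library implementation.

import importlib.util


def load_module_sketch(module_file):
    # Import a user-defined model file by its path and return the module
    # object, so callers can read attributes via module.__dict__.
    spec = importlib.util.spec_from_file_location("user_model", module_file)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module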
def distributed_train_and_evaluate(
    feature_shape,
    model_zoo_path,
    model_def,
    model_params="",
    eval_metrics_fn="eval_metrics_fn",
    loss="loss",
    training=True,
    dataset_name=DatasetName.IMAGE_DEFAULT,
    callback_classes=[],
    use_async=False,
    get_model_steps=1,
    ps_channels=None,
    pservers=None,
    distribution_strategy=DistributionStrategy.PARAMETER_SERVER,
):
    """Runs distributed training and evaluation with a local master.
    Grpc calls are mocked by local master call.

    Args:
        feature_shape: The shape of model input.
        model_zoo_path: The directory that contains user-defined model files
            or a specific model file.
        model_def: The import path to the model definition function/class in
            the model zoo, e.g. "cifar10_subclass.CustomModel".
        model_params: The dictionary of model parameters in a string that will
            be used to instantiate the model, e.g. "param1=1,param2=2".
        eval_metrics_fn: The name of the evaluation metrics function defined
            in the model file.
        loss: The name of the loss function defined in the model file.
        training: True for job type `TRAIN_WITH_EVALUATION`, False for
            job type `EVALUATION`.
        dataset_name: A dataset name from `DatasetName`.
        callback_classes: A list of callbacks that will be called at given
            stages of the training procedure.
        use_async: A bool. True if using asynchronous updates.
        get_model_steps: Worker will perform `get_model` from the parameter
            server every this many steps.
        ps_channels: A channel list to all parameter server pods.
        pservers: A list of parameter server pods.
        distribution_strategy: The distribution strategy used by workers, e.g.
            DistributionStrategy.PARAMETER_SERVER or
            DistributionStrategy.AllreduceStrategy.

    Returns:
        An integer indicating the model version after the distributed training
        and evaluation.
    """
    job_type = (
        JobType.TRAINING_WITH_EVALUATION if training else JobType.EVALUATION_ONLY
    )
    evaluation_steps = 1 if job_type == JobType.TRAINING_WITH_EVALUATION else 0
    batch_size = 8 if dataset_name == DatasetName.IMAGENET else 16
    pservers = pservers or []
    ps_channels = ps_channels or []

    model_module = load_module(
        get_module_file_path(model_zoo_path, model_def)
    ).__dict__

    for channel in ps_channels:
        grpc.channel_ready_future(channel).result()
    worker_arguments = [
        "--worker_id",
        "1",
        "--job_type",
        job_type,
        "--minibatch_size",
        batch_size,
        "--model_zoo",
        model_zoo_path,
        "--model_def",
        model_def,
        "--model_params",
        model_params,
        "--loss",
        loss,
        "--get_model_steps",
        get_model_steps,
        "--distribution_strategy",
        distribution_strategy,
    ]
    args = parse_worker_args(worker_arguments)
    worker = Worker(args, ps_channels=ps_channels)

    if dataset_name in [DatasetName.IMAGENET, DatasetName.FRAPPE]:
        record_num = batch_size
    else:
        record_num = 128
    shards = {
        create_recordio_file(record_num, dataset_name, feature_shape): (
            0,
            record_num,
        )
    }
    if training:
        training_shards = shards
        evaluation_shards = shards
    else:
        training_shards = {}
        evaluation_shards = shards
    task_d = _TaskDispatcher(
        training_shards,
        evaluation_shards,
        {},
        records_per_task=64,
        num_epochs=1,
    )

    if training:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            False,
            model_module[eval_metrics_fn],
        )
    else:
        evaluation_service = EvaluationService(
            None,
            task_d,
            0,
            0,
            evaluation_steps,
            True,
            model_module[eval_metrics_fn],
        )
    task_d.set_evaluation_service(evaluation_service)

    master = MasterServicer(
        batch_size,
        task_d,
        evaluation_service=evaluation_service,
    )
    callbacks = [
        callback_class(master, worker) for callback_class in callback_classes
    ]
    in_process_master = InProcessMaster(master, callbacks)
    worker._stub = in_process_master
    for pservicer in pservers:
        pservicer._master_stub = in_process_master

    worker.run()

    req = elasticdl_pb2.GetTaskRequest()
    req.worker_id = 1
    task = master.get_task(req, None)
    # No more task.
    if task.shard_name:
        raise RuntimeError(
            "There are some tasks unfinished after worker exits."
        )
    return master._version
def __init__(self, args):
    self.logger = get_logger("master", level=args.log_level.upper())

    self.num_ps_pods = args.num_ps_pods
    self.checkpoint_output_path = args.checkpoint_dir
    self.distribution_strategy = args.distribution_strategy

    # Master addr
    master_ip = os.getenv("MY_POD_IP", "localhost")
    self.master_addr = "%s:%d" % (master_ip, args.port)
    self.job_type = Master._get_job_type(args)
    self.rendezvous_server = None
    if self.distribution_strategy == DistributionStrategy.ALLREDUCE:
        self.rendezvous_server = HorovodRendezvousServer(master_ip)

    # Initialize TensorBoard service if requested
    self.tb_service = self._create_tensorboard_service(
        args.tensorboard_log_dir, master_ip
    )
    if self.tb_service:
        self.tb_client = TensorBoardClient(
            job_name=args.job_name,
            image_name=args.worker_image,
            namespace=args.namespace,
        )

    # Initialize the components from the model definition
    self.model_module = load_module(
        get_module_file_path(args.model_zoo, args.model_def)
    ).__dict__
    self.model_inst = load_model_from_module(
        args.model_def, self.model_module, args.model_params
    )
    self.optimizer = self.model_module[args.optimizer]()
    self._create_data_reader_fn = create_data_reader
    if args.custom_data_reader in self.model_module:
        self._create_data_reader_fn = self.model_module[
            args.custom_data_reader
        ]

    # Initialize the callbacks
    self.callbacks_list = load_callbacks_from_module(
        args.callbacks, self.model_module
    )
    self.callbacks_list.set_model(self.model_inst)
    set_callback_parameters(
        self.callbacks_list,
        batch_size=args.minibatch_size,
        saved_model_path=args.output,
        checkpoint_path=args.checkpoint_dir,
    )
    self._set_completed_steps_by_checkpoint(args.checkpoint_dir_for_init)

    # Start task queue
    records_per_task = args.minibatch_size * args.num_minibatches_per_task
    self.task_d = _make_task_dispatcher(
        args.training_data,
        args.validation_data,
        args.prediction_data,
        records_per_task,
        args.num_epochs,
        args.data_reader_params,
        self._create_data_reader_fn,
        self.callbacks_list,
    )
    self.task_d.add_deferred_callback_create_train_end_task()

    self.evaluation_service = self._create_evaluation_service(args)

    # Initialize instance manager
    self.instance_manager = self._create_instance_manager(args)

    # Initialize master service
    self.master_servicer, self.server = self._create_master_service(args)

    self._should_stop = False
    self._exit_code = 0
    threading.Thread(
        target=self._check_timeout_tasks,
        name="check_timeout_tasks",
        daemon=True,
    ).start()
import os
import tempfile
import unittest

from elasticdl.proto import elasticdl_pb2
from elasticdl.python.common.model_utils import (
    get_module_file_path,
    load_module,
)
from elasticdl.python.master.checkpoint_service import CheckpointService
from elasticdl.python.master.servicer import MasterServicer

_model_zoo_path = os.path.dirname(os.path.realpath(__file__))
_model_file = get_module_file_path(_model_zoo_path, "test_module.custom_model")
m = load_module(_model_file).__dict__


class CheckpointTest(unittest.TestCase):
    def testNeedToCheckpoint(self):
        checkpointer = CheckpointService("", 0, 5, False)
        self.assertFalse(checkpointer.is_enabled())
        checkpointer._steps = 3
        self.assertTrue(checkpointer.is_enabled())

        self.assertFalse(checkpointer.need_to_checkpoint(1))
        self.assertFalse(checkpointer.need_to_checkpoint(2))
        self.assertTrue(checkpointer.need_to_checkpoint(3))
        self.assertFalse(checkpointer.need_to_checkpoint(4))
        self.assertFalse(checkpointer.need_to_checkpoint(5))
        self.assertTrue(checkpointer.need_to_checkpoint(6))
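The assertions above encode a simple periodic rule; a minimal sketch of that behavior follows, assuming `need_to_checkpoint` reduces to a modulo check against `_steps` (a hypothetical helper for illustration, not the actual CheckpointService implementation).

def need_to_checkpoint_sketch(steps, version):
    # Checkpoint on every multiple of `steps`: with steps=3, versions 3 and 6
    # checkpoint while 1, 2, 4, and 5 do not, matching the test expectations.
    return steps > 0 and version % steps == 0


assert [need_to_checkpoint_sketch(3, v) for v in range(1, 7)] == [
    False, False, True, False, False, True,
]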