def print_args(args, groups=None):
    """Log every parsed command-line argument, grouped options first.

    Args:
        args: Parsing results returned from `parser.parse_args`.
        groups: A list of lists controlling which options should be
            printed together. For example, model specifications such as
            `optimizer` and `loss` are better printed together:
            groups = [["optimizer", "loss"]].
    """
    grouped_keys = set()
    if groups:
        for group in groups:
            for element in group:
                grouped_keys.add(element)
                # getattr with a default replaces the original
                # try/except AttributeError helper: missing attributes
                # are logged as None either way.
                logger.info("%s = %s", element, getattr(args, element, None))
    # Log every remaining option not covered by any group.
    for key, value in args.__dict__.items():
        if key not in grouped_keys:
            logger.info("%s = %s", key, value)
def _event_cb(self, event):
    # Callback for Kubernetes pod events. `event` is a dict carrying
    # "object" (a pod object) and "type" (e.g. ADDED/MODIFIED/DELETED).
    evt_obj = event.get("object")
    evt_type = event.get("type")
    if not evt_obj or not evt_type:
        logger.error("Event doesn't have object or type: %s" % event)
        return
    pod_name = evt_obj.metadata.name
    phase = evt_obj.status.phase
    logger.info("Got event %s, phase %s for pod: %s" % (evt_type, phase,
                                                        pod_name))
    relaunch = False
    with self._lock:
        worker_id = self._pod_name_to_id.get(pod_name)
        if (worker_id is None
                and pod_name != self._k8s_client.get_master_pod_name()):
            logger.error("Unknown worker pod name: %s" % pod_name)
            return
        # NOTE(review): when the event is for the master pod, worker_id
        # is None here, so a None key is stored in _pods_phase — confirm
        # this is intended.
        self._pods_phase[worker_id] = (pod_name, phase)
        if evt_type == "DELETED":
            del self._pods_phase[worker_id]
            del self._pod_name_to_id[pod_name]
            # Tasks checked out by the deleted worker must be recovered
            # so other workers can pick them up.
            self._task_d.recover_tasks(worker_id)
            # If a deleted pod was not "Succeeded", relaunch a worker.
            relaunch = (self._relaunch_deleted_live_worker
                        and phase != "Succeeded")
    # Relaunch outside the lock; _start_worker acquires it itself.
    if relaunch:
        logger.info("Relaunching worker.")
        self._start_worker(self._next_worker_id())
def create_master(self, **kargs):
    """Build and launch the master pod for this ElasticDL job."""
    # Expose the pod's own IP to the container via the downward API.
    env = [
        V1EnvVar(
            name="MY_POD_IP",
            value_from=V1EnvVarSource(
                field_ref=V1ObjectFieldSelector(field_path="status.podIP")
            ),
        )
    ]
    if "envs" in kargs:
        for name, value in kargs["envs"].items():
            env.append(V1EnvVar(name=name, value=value))
    master_pod = self._create_pod(
        pod_name=self.get_master_pod_name(),
        job_name=self.job_name,
        image_name=self._image_name,
        command=["python"],
        resource_requests=kargs["resource_requests"],
        resource_limits=kargs["resource_limits"],
        container_args=kargs["args"],
        pod_priority=kargs["pod_priority"],
        image_pull_policy=kargs["image_pull_policy"],
        restart_policy=kargs["restart_policy"],
        volume=kargs["volume"],
        owner_pod=None,
        env=env,
    )
    # Tag the pod with its replica type and index so it can be selected.
    master_pod.metadata.labels[ELASTICDL_REPLICA_TYPE_KEY] = "master"
    master_pod.metadata.labels[ELASTICDL_REPLICA_INDEX_KEY] = "0"
    self.client.create_namespaced_pod(self.namespace, master_pod)
    logger.info("Master launched.")
def delete_master(self):
    """Delete the master pod, terminating it without a grace period."""
    # Hoist the repeated get_master_pod_name() call; also use lazy
    # %-args so the message is only formatted when the level is enabled.
    master_pod_name = self.get_master_pod_name()
    logger.info("pod name is %s", master_pod_name)
    self.client.delete_namespaced_pod(
        master_pod_name,
        self.namespace,
        # grace_period_seconds=0 kills the pod immediately.
        body=client.V1DeleteOptions(grace_period_seconds=0),
    )
def process(self, predictions, worker_id):
    """Write a batch of predictions to ODPS, or log them when no
    writer is configured."""
    if not self.odps_writer:
        # No ODPS sink: just log the raw prediction values.
        logger.info(predictions.numpy())
        return
    self.odps_writer.from_iterator(
        iter(predictions.numpy().tolist()), worker_id
    )
def _gen(self):
    """
    A generator supports the iter() protocol (e.g. a generator function),
    which is used to create a dataset for RecordIO.
    """
    while True:
        task = self._worker.get_task()
        # An empty shard file name marks a control message, not data.
        if not task.shard_file_name:
            if task.type == elasticdl_pb2.WAIT:
                # The current dataset is exhausted, but more data may
                # arrive later; flag it so a new dataset can be built.
                self._pending_dataset = True
                logger.info(
                    "Finish current dataset, maybe more data later")
            else:
                logger.info("No more task, stopping")
            break
        with self._lock:
            if (self._training_with_evaluation
                    and task.type == elasticdl_pb2.EVALUATION):
                # Defer evaluation tasks; this generator only yields
                # records for the main (training/prediction) dataset.
                self._pending_eval_tasks.append(task)
                continue
            # Running total of records handed out; stored alongside the
            # task (presumably so consumers can infer task completion
            # from dataset progress — confirm against callers).
            self._record_count += task.end - task.start
            self._pending_tasks_with_counts.append(
                (task, self._record_count))
            if len(self._pending_tasks_with_counts) == 1:
                self._current_task = task
        # Stream the task's record range from the RecordIO shard.
        with closing(
                recordio.Scanner(task.shard_file_name, task.start,
                                 task.end - task.start)) as reader:
            while True:
                record = reader.record()
                if record:
                    yield record
                else:
                    break
def stop_embedding_service(self, save="nosave"):
    """Shut down every Redis instance backing the embedding service.

    Returns True when all instances stopped cleanly, False otherwise.
    """
    failures = []
    for ip, port_list in self._embedding_service_endpoint.items():
        for port in port_list:
            node_flags = "-h %s -p %d" % (ip, port)
            # A non-zero exit code means the shutdown command failed.
            if self._run_shell_command(
                    "redis-cli %s shutdown %s" % (node_flags, save)):
                failures.append(node_flags)
    if not failures:
        return True
    # node_flags looks like "-h <ip> -p <port>": after splitting on
    # spaces, fields 1 and 3 are the host and port.
    split_flags = [flags.split(" ") for flags in failures]
    logger.info(
        "Stop these redis nodes failed: %s." % ";".join(
            ["%s:%s" % (fields[1], fields[3]) for fields in split_flags]
        )
    )
    return False
def _gen(self):
    """
    A generator supports the iter() protocol (e.g. a generator function),
    used to create a `tf.data.Dataset` from a list of tasks.
    """
    while True:
        task = self._worker.get_task()
        # An empty shard name marks a control message, not a data task.
        if not task.shard_name:
            if task.type == elasticdl_pb2.WAIT:
                # Current dataset is exhausted but more data may arrive
                # later; flag it so a new dataset can be created.
                self._pending_dataset = True
                logger.info(
                    "Finish current dataset, maybe more data later")
            else:
                logger.info("No more task, stopping")
            break
        with self._lock:
            if (self._training_with_evaluation
                    and task.type == elasticdl_pb2.EVALUATION):
                # Defer evaluation tasks; this generator only yields
                # records for the main dataset.
                self._pending_eval_tasks.append(task)
                continue
            # Running total of records handed out, stored with the task
            # (presumably so consumers can infer task completion from
            # dataset progress — confirm against callers).
            self._record_count += task.end - task.start
            self._pending_tasks_with_counts.append(
                (task, self._record_count))
            if len(self._pending_tasks_with_counts) == 1:
                self._current_task = task
        for data in self._data_reader.read_records(task):
            if data:
                yield data
def start_redis_service(self):
    """Launch the configured number of Redis instances on consecutive
    ports, logging any ports that failed to start."""
    args = self._parse_embedding_service_args()
    last_port = args.first_port + args.num_of_redis_instances - 1
    logger.info("Starting redis server on ports: %d - %d, "
                "--cluster_node_timeout %d" % (
                    args.first_port,
                    last_port,
                    args.cluster_node_timeout,
                ))
    failed_port = []
    for port in range(args.first_port, last_port + 1):
        # Each instance gets its own config, AOF, dump, and log file,
        # all keyed by its port number.
        command = (
            "redis-server --port %d --cluster-enabled yes "
            "--cluster-config-file nodes-%d.conf --cluster-node-timeout"
            " %d --appendonly yes --appendfilename appendonly-%d.aof "
            "--dbfilename dump-%d.rdb --logfile %d.log --daemonize yes "
            "--protected-mode no"
            % (port, port, args.cluster_node_timeout, port, port, port))
        if self._run_shell_command(command):
            failed_port.append(port)
    if failed_port:
        local_ip = os.getenv("MY_POD_IP", "localhost")
        logger.info("%s starts these redis instances failed: %s"
                    % (local_ip, ";".join(map(str, failed_port))))
def create_tasks(self, task_type, model_version=-1):
    """Create one task per `records_per_task`-sized slice of every shard
    of the given type, and append them all to the todo queue.

    Args:
        task_type: elasticdl_pb2.TRAINING, EVALUATION, or PREDICTION.
        model_version: Model version the tasks are pinned to; -1 means
            unpinned/latest.

    Returns:
        The list of newly created tasks.
    """
    logger.info(
        "Creating a new set of %s tasks for model version %d",
        elasticdl_pb2._TASKTYPE.values_by_number[task_type].name.lower(),
        model_version,
    )
    if task_type == elasticdl_pb2.TRAINING:
        shards = self._training_shards
    elif task_type == elasticdl_pb2.EVALUATION:
        shards = self._evaluation_shards
    else:
        shards = self._prediction_shards
    tasks = []
    for name, num_records in shards.items():
        # Slice each shard into fixed-size record ranges; the last task
        # of a shard may be shorter.
        for start in range(0, num_records, self._records_per_task):
            tasks.append(
                _Task(
                    shard_name=name,
                    start=start,
                    end=min(start + self._records_per_task, num_records),
                    type=task_type,
                    model_version=model_version,
                ))
    if task_type == elasticdl_pb2.TRAINING:
        # Shuffle so workers don't all scan shards in the same order.
        random.shuffle(tasks)
        # NOTE(review): the training path extends _todo WITHOUT taking
        # self._lock — presumably because it is reached from get(),
        # which already holds the (non-reentrant) lock; confirm.
        self._todo.extend(tasks)
    else:
        with self._lock:
            self._todo.extend(tasks)
    return tasks
def create_training_tasks(self):
    """Build, shuffle, and enqueue one epoch's worth of training tasks."""
    logger.info("Creating a new set of training tasks with epoch=%d",
                self._epoch)
    new_tasks = self._create_tasks(
        self._training_shards, elasticdl_pb2.TRAINING
    )
    # Shuffle so workers do not all read shards in the same order.
    random.shuffle(new_tasks)
    self._todo.extend(new_tasks)
    return new_tasks
def start_tensorboard_service(self):
    """Create the TensorBoard service and report its external URL."""
    self._create_tensorboard_service()
    logger.info("Waiting for the URL for TensorBoard service...")
    url = self._get_tensorboard_url()
    if not url:
        logger.warning("Unable to get the URL for TensorBoard service")
    else:
        logger.info("TensorBoard service is available at: %s" % url)
def _print_docker_progress(line): error = line.get("error", None) if error: raise RuntimeError("Docker image build: " + error) stream = line.get("stream", None) if stream: logger.info(stream) else: logger.info(line)
def _remove_worker(self, worker_id):
    """Delete the pod of the given worker if the worker is known."""
    logger.info("Removing worker: %d", worker_id)
    with self._lock:
        if worker_id in self._pods_phase:
            # TODO: change _k8s_client to accept pod name instead of
            # worker id.
            self._k8s_client.delete_worker(worker_id)
        else:
            logger.error("Unknown worker id: %s" % worker_id)
def _build_docker_image(client, ctx_dir, dockerfile, image_name):
    """Build a Docker image and stream each progress line to the logger."""
    logger.info("===== Building Docker Image =====")
    build_stream = client.build(
        dockerfile=dockerfile,
        path=ctx_dir,
        rm=True,
        tag=image_name,
        decode=True,
    )
    for progress_line in build_stream:
        _print_docker_progress(progress_line)
def copy_if_not_exists(src, dst, is_dir):
    """Copy src to dst unless dst already exists.

    `is_dir` selects between a recursive directory copy and a
    single-file copy.
    """
    if not os.path.exists(dst):
        copy_fn = shutil.copytree if is_dir else shutil.copy
        copy_fn(src, dst)
    else:
        logger.info(
            "Skip copying from %s to %s since the destination already exists"
            % (src, dst)
        )
def _init_model(self, checkpoint_filename_for_init, init_var):
    """Initialize model parameters.

    Priority: restore from a checkpoint file first, then from the given
    variable list; otherwise defer initialization until the first update
    reported by a worker.

    Args:
        checkpoint_filename_for_init: Path of a checkpoint to restore
            from, or a falsy value to skip.
        init_var: A list of variables to initialize from when no
            checkpoint is given.
    """
    if checkpoint_filename_for_init:
        pb_model = load_from_checkpoint_file(checkpoint_filename_for_init)
        # Resume the version counter from the checkpoint.
        self._version = pb_model.version
        self._init_model_from_tensor_dict(pb_model.param)
    elif init_var:
        self._init_model_from_var_list(init_var)
    else:
        # Fixed typo in the log message: "intialized" -> "initialized".
        logger.info("Model is not initialized. It will be "
                    "initialized by the first update from "
                    "the worker.")
def create_evaluation_tasks(self, eval_model_version):
    """Create and enqueue evaluation tasks pinned to a model version."""
    logger.info(
        "Creating a new set of evaluation tasks for model version %d",
        eval_model_version,
    )
    new_tasks = self._create_tasks(
        self._evaluation_shards,
        elasticdl_pb2.EVALUATION,
        eval_model_version,
    )
    # Guard the queue mutation with the scheduler lock.
    with self._lock:
        self._todo.extend(new_tasks)
    return new_tasks
def create_prediction_tasks(self, predict_model_version):
    """Create and enqueue prediction tasks pinned to a model version."""
    logger.info(
        "Creating a new set of prediction tasks for model version %d",
        predict_model_version,
    )
    new_tasks = self._create_tasks(
        self._prediction_shards,
        elasticdl_pb2.PREDICTION,
        predict_model_version,
    )
    # Guard the queue mutation with the scheduler lock.
    with self._lock:
        self._todo.extend(new_tasks)
    return new_tasks
def _process_minibatch(
    self,
    task_type,
    features,
    labels,
    min_model_version,
    train_with_local_model=False,
):
    """Run one minibatch of evaluation, training, or prediction,
    retrying until the master accepts the result.

    Args:
        task_type: elasticdl_pb2.TRAINING, EVALUATION, or PREDICTION.
        features: Minibatch features.
        labels: Minibatch labels (unused for prediction).
        min_model_version: Minimum model version required for this task;
            -1 means unspecified.
        train_with_local_model: If True, skip pulling the model from the
            master before a training step.

    Returns:
        The (possibly updated by training) minimum model version.

    Raises:
        RuntimeError: On an unrecognized task type, or when the retry
            budget is exhausted without an accepted result.
    """
    if self._need_embedding_layer_check or not self._var_created:
        # Lazily create variables/embedding layers on first use.
        self._run_model_call_before_training(features)
    for _ in range(self._max_minibatch_retry_num):
        if task_type == elasticdl_pb2.EVALUATION:
            if min_model_version == -1:
                # No pinned version: ensure at least some model is
                # present before evaluating.
                if self._model_version < 0:
                    self.get_model(0, elasticdl_pb2.MINIMUM)
            elif self._model_version != min_model_version:
                # Pin to exactly the requested version.
                self.get_model(min_model_version, elasticdl_pb2.FIXED)
            accepted = self._run_evaluation_task(features, labels)
            if accepted:
                break
        elif task_type == elasticdl_pb2.TRAINING:
            # TODO: optimize the logic to avoid unnecessary
            # get_model call.
            if not train_with_local_model:
                self.get_model(
                    max(self._model_version, min_model_version),
                    elasticdl_pb2.MINIMUM,
                )
            accepted, min_model_version, loss = self._run_training_task(
                features, labels)
            if accepted:
                logger.info("Loss is %f" % loss.numpy())
                break
        elif task_type == elasticdl_pb2.PREDICTION:
            if self._model_version != min_model_version:
                self.get_model(min_model_version, elasticdl_pb2.FIXED)
            accepted = self._run_prediction_task(features)
            if accepted:
                break
        else:
            raise RuntimeError("Unrecognized task type, %s" % task_type)
    else:
        # for-else: runs only when the retry loop exhausted without
        # a break, i.e. no attempt was accepted.
        # Worker got stuck, fail the task.
        # TODO: stop the worker if it fails to make any
        # progress for some time.
        raise RuntimeError("Worker got stuck")
    return min_model_version
def _save_checkpoint(self, locking, is_eval_checkpoint):
    """Snapshot the current model and save it via the checkpoint service.

    Args:
        locking: Whether to hold self._lock while snapshotting.
        is_eval_checkpoint: Forwarded to the checkpoint service to mark
            evaluation checkpoints.

    Returns:
        The model version that was checkpointed, or None on failure.
    """
    try:
        logger.info("Saving checkpoint for model version %d"
                    % self._version)
        if locking:
            self._lock.acquire()
        try:
            pb_model = self._get_model_no_lock()
            self._checkpoint_service.save(self._version, pb_model,
                                          is_eval_checkpoint)
            checkpoint_version = self._version
        finally:
            # Bug fix: the original released the lock only on success,
            # leaking it permanently if save() or _get_model_no_lock()
            # raised while the lock was held.
            if locking:
                self._lock.release()
        return checkpoint_version
    except Exception:
        # Best-effort: checkpoint failures are logged, not fatal.
        logger.error(
            "Failed to save checkpoint file for model version %d"
            % self._version)
def complete_task(self):
    # Mark one evaluation task done; when the whole evaluation job has
    # finished, publish its metrics and clean up.
    self._eval_job.complete_task()
    if self._eval_job.finished():
        evaluation_metrics = self._eval_job.get_evaluation_summary()
        if self._tensorboard_service and evaluation_metrics:
            self._tensorboard_service.write_dict_to_summary(
                evaluation_metrics, version=self._eval_job.model_version)
        logger.info("Evaluation metrics[v=%d]: %s" % (
            # A negative model_version presumably means "evaluate the
            # current model"; report the master's live version then.
            self._eval_job.model_version
            if self._eval_job.model_version >= 0
            else self._master_servicer.get_model_version(),
            str(evaluation_metrics),
        ))
        if not self._eval_only:
            # delete checkpoint file
            self._checkpoint_service.remove_eval_checkpoint(
                self._eval_job.model_version)
        self._eval_job = None
        # create new eval job if possible
        self.try_to_create_new_job()
def _start_worker(self, worker_id):
    """Launch a worker pod and record it in the bookkeeping maps."""
    logger.info("Starting worker: %d" % worker_id)
    with self._lock:
        worker_pod = self._k8s_client.create_worker(
            worker_id=worker_id,
            resource_requests=self._resource_requests,
            resource_limits=self._resource_limits,
            pod_priority=self._pod_priority,
            volume=self._volume,
            image_pull_policy=self._image_pull_policy,
            command=self._command,
            args=self._args + ["--worker_id", str(worker_id)],
            restart_policy=self._restart_policy,
            envs=self._envs,
        )
        pod_name = worker_pod.metadata.name
        # Phase is unknown until the first pod event arrives.
        self._pod_name_to_id[pod_name] = worker_id
        self._pods_phase[worker_id] = (pod_name, None)
def __init__(
    self,
    training_shards,
    evaluation_shards,
    prediction_shards,
    records_per_task,
    num_epochs,
):
    """
    Arguments:
        training_shards: A dictionary from RecordIO file name to the
            number of training records.
        evaluation_shards: A dictionary from RecordIO file name to
            the number of evaluation records.
        prediction_shards: A dictionary from RecordIO file name to
            the number of prediction records.
        records_per_task: The number of records per task.
        num_epochs: The total number of epochs for the tasks where
            an epoch is a complete iteration over the shards.
    """
    self._lock = threading.Lock()
    self._num_epochs = num_epochs
    self._epoch = 0
    self._training_shards = training_shards
    self._evaluation_shards = evaluation_shards
    self._prediction_shards = prediction_shards
    self._records_per_task = records_per_task
    # Tasks not yet handed out to any worker.
    self._todo = []
    # dictionary from task id to Task.
    self._doing = {}
    self._task_id = 0
    self._evaluation_service = None
    # Seed exactly one kind of task set at startup; training takes
    # precedence over evaluation, which takes precedence over prediction.
    if self._training_shards:
        logger.info("Starting epoch %d", self._epoch)
        self.create_tasks(elasticdl_pb2.TRAINING)
    elif self._evaluation_shards:
        self.create_tasks(elasticdl_pb2.EVALUATION)
    elif self._prediction_shards:
        self.create_tasks(elasticdl_pb2.PREDICTION)
def _submit_job(image_name, client_args, container_args):
    """Create a Kubernetes client and launch the ElasticDL master pod."""
    k8s_client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
    )
    k8s_client.create_master(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=container_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )
    logger.info(
        "ElasticDL job %s was successfully submitted. The master pod is: %s."
        % (client_args.job_name, k8s_client.get_master_pod_name())
    )
def get(self, worker_id):
    """Return next (task_id, Task) tuple"""
    with self._lock:
        # TODO: check if task queue doesn't have training task,
        # to avoid the queue is overwhelmed by evaluation tasks.
        if not self._todo and self._epoch < self._num_epochs - 1:
            # Start a new epoch
            # NOTE: create_tasks is called while this lock is held; its
            # training path therefore must not re-acquire self._lock.
            self._epoch += 1
            self.create_tasks(elasticdl_pb2.TRAINING)
            logger.info("Starting epoch %d", self._epoch)
        if not self._todo:
            # No more tasks
            return -1, None
        # Task ids are monotonically increasing and never reused.
        self._task_id += 1
        # pop() takes from the tail of the queue (LIFO order).
        task = self._todo.pop()
        # TODO: Handle timeout of tasks.
        self._doing[self._task_id] = (worker_id, task)
        return self._task_id, task
def report(self, task_id, success):
    """Report if the task is successful or not"""
    eval_completed = False
    with self._lock:
        _, task = self._doing.pop(task_id, (-1, None))
        if not task:
            logger.warning("Unknown task_id: %d" % task_id)
        elif not success:
            # Requeue the failed task for another worker.
            # TODO: keep count of retries.
            self._todo.append(task)
        elif (task.type == elasticdl_pb2.EVALUATION
              and self._evaluation_service is not None):
            eval_completed = True
        else:
            logger.info(
                "Task:%d completed, %d remaining tasks",
                task_id,
                len(self._todo) + len(self._doing),
            )
    # Notify the evaluation service outside the lock.
    if eval_completed:
        self._evaluation_service.complete_task()
def start_embedding_pod_and_redis(
    self,
    command,
    args,
    embedding_service_id=0,
    resource_request="cpu=1,memory=4096Mi",
    resource_limit="cpu=1,memory=4096Mi",
    pod_priority=None,
    volume=None,
    image_pull_policy=None,
    restart_policy="Never",
    **kargs,
):
    """Launch the embedding-service pod and record its Redis endpoint.

    Blocks until the pod has been assigned an IP, then stores the
    endpoint mapping {ip: [ports]} on self._embedding_service_endpoint.
    """
    import time  # local import: only needed for the polling backoff

    logger.info("Starting pod for embedding service ...")
    self._k8s_client = k8s.Client(event_callback=None, **kargs)
    pod = self._k8s_client.create_embedding_service(
        worker_id=embedding_service_id,
        resource_requests=resource_request,
        resource_limits=resource_limit,
        pod_priority=pod_priority,
        volume=volume,
        image_pull_policy=image_pull_policy,
        command=command,
        args=args,
        restart_policy=restart_policy,
    )
    # TODO: assign address with pod's domain name instead of pod's ip.
    # and should not fix ports
    address_ip = pod.status.pod_ip
    while not address_ip:
        # Bug fix: the original busy-waited with no delay, hammering the
        # API server until the pod was scheduled. Back off briefly.
        time.sleep(1)
        pod = self._k8s_client.get_embedding_service_pod(
            embedding_service_id)
        address_ip = pod.status.pod_ip
    self._embedding_service_endpoint = {
        address_ip: [30001 + i for i in range(6)]
    }
def write_to_recordio(filename, data_list):
    """Write each element of data_list as a record to a RecordIO file.

    Args:
        filename: Path of the RecordIO file to create.
        data_list: Iterable of serialized records to write.
    """
    # Bug fix: the original called logger.info("Writing to file:",
    # filename) — a %-style argument with no placeholder in the format
    # string, which makes the logging module raise a formatting error.
    logger.info("Writing to file: %s", filename)
    with closing(recordio.Writer(filename)) as f:
        for d in data_list:
            f.write(d)
def _push_docker_image(client, image_name):
    """Push the image via the Docker client, logging streamed progress."""
    logger.info("===== Pushing Docker Image =====")
    push_stream = client.push(image_name, stream=True, decode=True)
    for progress_line in push_stream:
        _print_docker_progress(progress_line)