def _driver_fn(client, net_if):
    cluster_tasks = _task_commons._get_cluster_tasks(client)
    # Worker discovery
    worker_list = [f"{net_if[1]}:{N_PROCESS_PER_WORKER}"]
    n_workers = 1
    for cluster_task in cluster_tasks:
        if 'worker' in cluster_task:
            worker_addr = event.wait(client, f"{cluster_task}/addr")
            logger.info(f"{cluster_task}: {worker_addr}")
            worker_list.append(f"{worker_addr}:{N_PROCESS_PER_WORKER}")
            n_workers += 1

    # Worker task allocation to workers
    hosts = gloo_run.parse_hosts(','.join(worker_list))
    host_alloc_plan = gloo_run.get_host_assignments(hosts, n_workers)
    for host in host_alloc_plan:
        host_info = (f"{host.rank},{host.size},{host.local_rank},"
                     f"{host.local_size},{host.cross_rank},{host.cross_size}")
        event.broadcast(client, f"{get_task()}/{host.hostname}", host_info)

    global_rendezv = RendezvousServer(verbose=1)
    global_rendezv_port = global_rendezv.start_server()
    global_rendezv.httpd.init(host_alloc_plan)
    event.broadcast(client, f"{get_task()}/sock_addr",
                    f"{net_if[1]}:{global_rendezv_port}")
    return global_rendezv.listen_thread
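# A possible chief-side wiring for the driver above (a hedged sketch, not the
# verbatim caller): start the rendezvous, take part in training as rank 0,
# then wait on the returned listen thread. ``_get_net_if`` and the exact call
# order are assumptions for illustration.
def _chief_fn_sketch(client):
    net_if = _get_net_if()                    # assumed: (iface_name, iface_address)
    listen_thread = _driver_fn(client, net_if)
    _worker_fn(client, get_task(), net_if)    # the chief also trains
    listen_thread.join()                      # block until the rendezvous shuts down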
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])
    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite config to do nothing but training to improve training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config

        logger.info("start training..")

        experiment.estimator.train(
            experiment.train_spec.input_fn,
            hooks=experiment.train_spec.hooks,
            max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()

        logger.info("start training..")

        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")
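# The HOROVOD_* variables above mirror what ``horovodrun`` exports for the
# gloo controller, so after ``hvd.init()`` each process should observe the
# same topology. A minimal consistency check (a sketch, assuming the gloo
# controller honours these variables):
def _check_hvd_topology_sketch():
    assert hvd.rank() == int(os.environ['HOROVOD_RANK'])
    assert hvd.size() == int(os.environ['HOROVOD_SIZE'])
    assert hvd.local_rank() == int(os.environ['HOROVOD_LOCAL_RANK'])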
def _setup_master(client: skein.ApplicationClient, rank: int) -> None:
    if rank == 0:
        with _internal.reserve_sock_addr() as host_port:
            event.broadcast(client, MASTER_ADDR, host_port[0])
            event.broadcast(client, MASTER_PORT, str(host_port[1]))
            os.environ[MASTER_ADDR] = host_port[0]
            os.environ[MASTER_PORT] = str(host_port[1])
    else:
        master_addr = event.wait(client, MASTER_ADDR)
        master_port = event.wait(client, MASTER_PORT)
        os.environ[MASTER_ADDR] = master_addr
        os.environ[MASTER_PORT] = master_port
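# MASTER_ADDR/MASTER_PORT are the variables PyTorch's ``env://`` rendezvous
# reads, so once ``_setup_master`` has run, every task can join the process
# group. A minimal sketch; the backend choice and explicit rank/world_size
# arguments are illustrative assumptions:
def _init_process_group_sketch(rank: int, world_size: int) -> None:
    import torch.distributed as dist
    dist.init_process_group(backend="gloo",   # "nccl" would fit GPU nodes
                            init_method="env://",
                            rank=rank,
                            world_size=world_size)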
def _start_tracker(client, n_workers: int):
    tf.logging.info(f"Starting tracker with {n_workers} workers")
    rabit_context = tracker.RabitTracker(
        hostIP=tracker.get_host_ip(),
        nslave=n_workers,
        # will do bind(0) -> choose a random port
        port=0,
        port_end=1)
    rabit_context.start(n_workers)
    thread = Thread(target=rabit_context.join, daemon=True)
    thread.start()

    event.broadcast(
        client,
        f"{cluster.get_task()}/tracker",
        f"{rabit_context.hostIP}:{rabit_context.port}"
    )
    return thread
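# Worker-side counterpart (a hedged sketch): each worker waits for the
# broadcast tracker address and hands it to Rabit through the standard DMLC
# variables before training. The "chief:0" key and the xgboost arg encoding
# are assumptions for illustration.
def _connect_to_tracker_sketch(client, n_workers: int):
    import xgboost as xgb
    host, port = event.wait(client, "chief:0/tracker").split(':')
    xgb.rabit.init([
        f"DMLC_TRACKER_URI={host}".encode(),
        f"DMLC_TRACKER_PORT={port}".encode(),
        f"DMLC_NUM_WORKER={n_workers}".encode(),
    ])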
def client_tf(client):
    spec = create_skein_app()
    app = client.submit_and_connect(spec)
    x = tf.placeholder(tf.float32, 100)

    with tf.device(f"/job:{NODE_NAME}/task:1"):
        first_batch = tf.slice(x, [0], [50])
        mean1 = tf.reduce_mean(first_batch)

    with tf.device(f"/job:{NODE_NAME}/task:0"):
        second_batch = tf.slice(x, [50], [-1])
        mean2 = tf.reduce_mean(second_batch)
        mean = (mean1 + mean2) / 2

    first_task = event.wait(app, f"{NODE_NAME}:0/init")
    with tf.Session(f"grpc://{first_task}") as sess:
        result = sess.run(mean, feed_dict={x: np.random.random(100)})
        print(f"mean = {result}")
    event.broadcast(app, "stop", "1")
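# Server-side counterpart implied by the events above (a hedged sketch):
# each container starts a TF server, publishes its gRPC endpoint under
# ``{NODE_NAME}:{task_id}/init``, then blocks on "stop" so it stays alive
# while the client drives the graph. Function and argument names are
# assumptions.
def server_fn(client, cluster_spec, task_id: int):
    server = tf.train.Server(cluster_spec,
                             job_name=NODE_NAME,
                             task_index=task_id)
    event.broadcast(client, f"{NODE_NAME}:{task_id}/init",
                    server.target.replace("grpc://", ""))
    event.wait(client, "stop")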
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])
    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)

    if task != 'chief:0':
        # Overwrite config to do nothing but training to improve training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config

    logger.info("start training..")

    experiment.estimator.train(
        experiment.train_spec.input_fn,
        hooks=experiment.train_spec.hooks,
        max_steps=experiment.train_spec.max_steps)
def standalone_client_mode(pyenv_zip_path: Union[str, Dict[NodeLabel, str]],
                           task_specs: Dict[str, TaskSpec] = None,
                           tf_session_config: Optional[tf.ConfigProto] = None,
                           *,
                           skein_client: skein.Client = None,
                           files: Dict[str, str] = None,
                           env: Dict[str, str] = {},
                           queue: str = "default",
                           acls: ACLs = None,
                           file_systems: List[str] = None,
                           log_conf_file: str = None):
    """
    https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/distribute/README.md#standalone-client-mode

    Standalone client mode starts the tf servers on the cluster, launches
    everything on the client, and lets tf take care of the rest.

    It is not limited to the Estimator API; it also works with low-level tf
    (see session_run_example.py).

    Parameters
    ----------

    pyenv_zip_path
        Path to an archive of a python environment to be deployed.
        It can be a zipped conda env or a pex archive.
        For a mixed GPU/CPU cluster, provide a dictionary with both
        environments.

    skein_client
        Skein client used to submit yarn jobs.

    task_specs
        Resources to allocate for each task type. The keys must be a
        subset of ``"chief"``, ``"worker"``, ``"ps"``, and ``"evaluator"``.
        The minimal spec must contain at least ``"chief"``.

    tf_session_config
        tf.ConfigProto to be provided to each started TFServer.

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, and the values are their corresponding
        local sources. Note that the container root is appended to
        ``PYTHONPATH``, so any listed Python module or package is
        automatically importable.

    env
        Environment variables to forward to the containers.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional, defaults to no ACLs.

        See `ACLs <https://jcrist.github.io/skein/specification.html#id16>`
        for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    log_conf_file
        Optional file with the log config. By default logging is set up
        with INFO verbosity; if you specify a file here, don't forget to
        also ship it to the containers via the ``files`` arg.
    """
    cluster = None
    try:
        pyenvs = _setup_pyenvs(pyenv_zip_path, standalone_client_mode=True)
        cluster = _setup_skein_cluster(
            pyenvs=pyenvs,
            skein_client=skein_client,
            task_specs=StaticDefaultDict(task_specs, default=TASK_SPEC_NONE),
            files=files,
            env=env,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            log_conf_file=log_conf_file,
            standalone_client_mode=True)
        _send_config_proto(cluster, tf_session_config)
        yield cluster.cluster_spec
    finally:
        if cluster:
            broadcast(cluster.app, "stop", "1")
def broadcast(self, key: str, value: str):
    broadcast(self.client, f'{self.task}/{key}', value)
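# Hypothetical usage, assuming the method above sits on a task-scoped helper
# holding a skein ``client`` and its own ``task`` id (e.g. "worker:1").
# Prefixing each key with the task id keeps per-task broadcasts from
# colliding:
def _broadcast_usage_sketch(ctx, client):
    ctx.broadcast("addr", "10.0.0.4")            # publishes under "worker:1/addr"
    return event.wait(client, "worker:1/addr")   # peers read it back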
def standalone_client_mode(
    pyenv_zip_path: Union[str, Dict[topologies.NodeLabel, str]],
    task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
    tf_session_config: Optional[tf.ConfigProto] = None,
    *,
    skein_client: skein.Client = None,
    files: Dict[str, str] = None,
    env: Dict[str, str] = {},
    queue: str = "default",
    acls: ACLs = _default_acls_all_access(),
    file_systems: List[str] = None,
    name: str = "RunOnYarn",
    pre_script_hook: Optional[str] = None
):
    """
    https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/contrib/distribute/README.md#standalone-client-mode

    Standalone client mode starts the tf servers on the cluster, launches
    everything on the client, and lets tf take care of the rest.

    It is not limited to the Estimator API; it also works with low-level tf
    (see session_run_example.py).

    Parameters
    ----------

    pyenv_zip_path
        Path to an archive of a python environment to be deployed.
        It can be a zipped conda env or a pex archive.
        For a mixed GPU/CPU cluster, provide a dictionary with both
        environments.

    skein_client
        Skein client used to submit yarn jobs.

    task_specs
        Resources to allocate for each task type. The keys must be a
        subset of ``"chief"``, ``"worker"``, ``"ps"``, and ``"evaluator"``.
        The minimal spec must contain at least ``"chief"``.

    tf_session_config
        tf.ConfigProto to be provided to each started TFServer.

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, and the values are their corresponding
        local sources. Note that the container root is appended to
        ``PYTHONPATH``, so any listed Python module or package is
        automatically importable.

    env
        Environment variables to forward to the containers.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional, defaults to ACLs granting all access.

        See `ACLs <https://jcrist.github.io/skein/specification.html#acls>`
        for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    name
        Name of the yarn application.

    pre_script_hook
        Bash command used to prepare the Hadoop environment.
    """
    skein_cluster = None  # guard against failures before the cluster is created
    try:
        pyenvs = _setup_pyenvs(pyenv_zip_path)
        skein_cluster = _setup_skein_cluster(
            pyenvs=pyenvs,
            task_specs=task_specs,
            standalone_client_mode=True,
            skein_client=skein_client,
            files=files,
            env=env,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name,
            pre_script_hook=pre_script_hook
        )
        with _shutdown_on_exception(skein_cluster.app):
            cluster_spec = _setup_cluster_spec(skein_cluster.tasks,
                                               skein_cluster.app,
                                               True)
            _send_config_proto(skein_cluster, tf_session_config)
            yield cluster_spec
    finally:
        if skein_cluster:
            event.broadcast(skein_cluster.app, "stop", "1")
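# Hedged usage sketch, following the TF 1.13 standalone-client-mode recipe
# linked in the docstring and assuming the generator above is wrapped in
# ``contextlib.contextmanager`` (as its ``yield`` suggests). The pex path,
# task specs and strategy choice are placeholders, not prescriptions.
def _standalone_usage_sketch():
    with standalone_client_mode(
            pyenv_zip_path="hdfs:///user/me/envs/my_env.pex",
            task_specs={
                "chief": topologies.TaskSpec(memory=4 * 2 ** 10, vcores=4),
                "worker": topologies.TaskSpec(memory=4 * 2 ** 10, vcores=4,
                                              instances=2),
            }) as cluster_spec:
        distrib_config = tf.contrib.distribute.DistributeConfig(
            train_distribute=tf.contrib.distribute.CollectiveAllReduceStrategy(
                num_gpus_per_worker=0),
            remote_cluster=cluster_spec)
        run_config = tf.estimator.RunConfig(
            experimental_distribute=distrib_config)
        # build the estimator with run_config and call train() from the client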