def test__get_experiment_exception():
    with contextlib.ExitStack() as stack:
        stack.enter_context(patch(f'{MODULE_TO_TEST}.cluster'))
        mocked_event = stack.enter_context(patch(f'{MODULE_TO_TEST}.event'))
        mocked_client = mock.MagicMock(spec=skein.ApplicationClient)

        def experiment_f():
            raise Exception()

        mocked_client.kv.wait.return_value = cloudpickle.dumps(experiment_f)

        with pytest.raises(Exception):
            _get_experiment(mocked_client)

        mocked_event.start_event.assert_called_once()
        mocked_event.stop_event.assert_called_once()
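# A minimal sketch of the `_get_experiment` contract implied by the tests in this
# section (see also test__get_experiment_object below): the skein kv store yields a
# cloudpickled zero-argument function whose return value is the experiment; if calling
# it raises, a start/stop event pair is emitted exactly once and the exception
# propagates. The kv key name and the event-call signatures are assumptions for
# illustration, not the library's actual API.
def _get_experiment_sketch(client):
    experiment_fn = cloudpickle.loads(client.kv.wait("experiment_fn"))  # assumed key
    try:
        return experiment_fn()
    except Exception as e:
        event.start_event(client, None)    # hypothetical signatures; the tests only
        event.stop_event(client, None, e)  # assert each is called exactly once
        raise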
def _evaluate(stop):
    experiment = _task_commons._get_experiment(client)
    time.sleep(experiment.eval_spec.start_delay_secs)

    evaluated_checkpoints = set()
    while True:
        latest_checkpoint = experiment.estimator.latest_checkpoint()
        latest_eval_result = None
        if latest_checkpoint and latest_checkpoint not in evaluated_checkpoints:
            latest_eval_result = experiment.estimator.evaluate(
                experiment.eval_spec.input_fn,
                steps=experiment.eval_spec.steps,
                hooks=experiment.eval_spec.hooks,
                name=experiment.eval_spec.name
            )
            # Remember the checkpoint so it is not re-evaluated on the next pass
            evaluated_checkpoints.add(latest_checkpoint)

        if experiment.train_spec.max_steps:
            if latest_eval_result and latest_eval_result.status == _EvalStatus.EVALUATED:
                global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
                if global_step and global_step >= experiment.train_spec.max_steps:
                    break
        else:
            if stop():
                break

        time.sleep(experiment.eval_spec.throttle_secs)
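# The `stop` argument is a zero-argument callable polled once per cycle when
# `train_spec.max_steps` is unset. A minimal usage sketch, assuming the loop runs on
# a worker thread and is stopped from elsewhere via a threading.Event:
import threading

stop_evaluation = threading.Event()
evaluator = threading.Thread(target=_evaluate, args=(stop_evaluation.is_set,))
evaluator.start()
# ... later, once training is known to be over:
stop_evaluation.set()
evaluator.join()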
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()

    with _internal.reserve_sock_addr() as host_port:
        client, cluster_spec, cluster_tasks = _task_commons._prepare_container(host_port)
        # Variable TF_CONFIG must be set before instantiating
        # the estimator to train in a distributed way
        cluster.setup_tf_config(cluster_spec)
        experiment = _task_commons._get_experiment(client)
        if isinstance(experiment, Experiment):
            session_config = experiment.config.session_config
        elif isinstance(experiment, KerasExperiment):
            raise ValueError("KerasExperiment using parameter strategy is unsupported")
        else:
            raise ValueError("experiment must be an Experiment or a KerasExperiment")
        _logger.info(f"Starting server {task_type}:{task_id}")

    # The reserved socket is released on exiting the `with` block so that the
    # TF server can bind the same address
    cluster.start_tf_server(cluster_spec, session_config)
    thread = _task_commons._execute_dispatched_function(client, experiment)

    # "ps" tasks do not terminate by themselves. See
    # https://github.com/tensorflow/tensorflow/issues/4713
    if task_type not in ['ps']:
        thread.join()
        _logger.info(f"{task_type}:{task_id} {thread.state}")

    _task_commons._shutdown_container(client, cluster_tasks, session_config, thread)
def main() -> None:
    _log_sys_info()
    task_type, task_id = get_task_description()
    client = skein.ApplicationClient.from_current()

    experiment = _get_experiment(client)
    assert isinstance(experiment, PytorchExperiment)

    cluster_tasks = _get_cluster_tasks(client)
    n_workers_per_executor = experiment.n_workers_per_executor
    world_size = len([t for t in cluster_tasks if "worker" in t]) * n_workers_per_executor
    _logger.info(f"Task type: {task_type}; Task id: {task_id}; "
                 f"World size: {world_size}; Cluster tasks: {cluster_tasks}")

    if n_workers_per_executor > 1:
        workers = list()
        mp.set_start_method("spawn", force=True)
        for n in range(n_workers_per_executor):
            worker = mp.Process(
                target=_train,
                args=(_get_device(n),
                      (task_id * n_workers_per_executor) + n,
                      world_size,
                      _get_collective_ops_backend(n_workers_per_executor)))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    else:
        _train(0, task_id, world_size, "nccl")
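# `_get_device` and `_get_collective_ops_backend` are not shown in this section. A
# plausible sketch consistent with the call sites above (local worker n gets GPU n,
# NCCL when CUDA is available, gloo otherwise); an illustrative assumption, not the
# actual implementation:
import torch

def _get_device_sketch(worker_index: int) -> int:
    # one GPU per local worker process
    return worker_index

def _get_collective_ops_backend_sketch(n_workers_per_executor: int) -> str:
    return "nccl" if torch.cuda.is_available() else "gloo"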
def _worker_fn(task_type, task_id, client):
    os.environ['DMLC_RANK'] = "0" if task_type == 'chief' else f"{task_id + 1}"
    os.environ['DMLC_ROLE'] = "worker"

    cluster_tasks = _task_commons._get_cluster_tasks(client)
    logger.info(cluster_tasks)

    if task_type == 'chief':
        _start_tracker(client, len(cluster_tasks))

    _setup_tracker(client)
    rabit.init()

    experiment = _task_commons._get_experiment(client)

    if task_type != 'chief':
        # Overwrite the config so this worker does nothing but training,
        # which improves training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None
        )
        experiment.estimator._config = new_config

    logger.info("start training...")
    experiment.estimator.train(
        experiment.train_spec.input_fn,
        hooks=experiment.train_spec.hooks,
        max_steps=experiment.train_spec.max_steps)
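# `_start_tracker` and `_setup_tracker` are not shown here. A minimal sketch of the
# worker side, assuming the chief publishes the rabit tracker address through the same
# event helpers used elsewhere in this section, and each worker exports the standard
# DMLC tracker variables before rabit.init(). The kv key name "tracker" is an
# assumption for illustration:
def _setup_tracker_sketch(client) -> None:
    uri, port = event.wait(client, "tracker").split(":")
    os.environ["DMLC_TRACKER_URI"] = uri
    os.environ["DMLC_TRACKER_PORT"] = port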
def main() -> None:
    _task_commons._log_sys_info()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    client = skein.ApplicationClient.from_current()
    _task_commons._setup_container_logs(client)
    cluster_tasks = _task_commons._get_cluster_tasks(client)

    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        experiment = _task_commons._get_experiment(client)
        model_dir = experiment.estimator.config.model_dir

    _logger.info(f"Starting tensorboard on {model_dir}")

    thread = _internal.MonitoredThread(
        name=f"{task_type}:{task_id}",
        target=tensorboard.start_tf_board,
        args=(client, model_dir),
        daemon=True)
    thread.start()

    for cluster_task in cluster_tasks:
        event.wait(client, f"{cluster_task}/stop")

    timeout = tensorboard.get_termination_timeout()
    thread.join(timeout)

    event.stop_event(client, task, thread.exception)
    event.broadcast_container_stop_time(client, task)
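# `tensorboard.get_termination_timeout` is not shown. A plausible sketch mirroring the
# TB_MODEL_DIR pattern above, reading the timeout from an environment variable with a
# default; the variable name and default value are assumptions for illustration:
def get_termination_timeout_sketch() -> float:
    return float(os.getenv("TB_TERMINATION_TIMEOUT_SECONDS", "600"))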
def evaluator_fn(client):
    experiment = _task_commons._get_experiment(client)
    if isinstance(experiment, Experiment):
        evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20 min
    elif isinstance(experiment, KerasExperiment):
        keras_evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20 min
    else:
        raise ValueError("experiment must be an Experiment or a KerasExperiment")
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)
    if isinstance(experiment, Experiment):
        if not is_chief(get_task_type(task)):
            # Overwrite the config so this worker does nothing but training,
            # which improves training speed
            experiment.estimator._model_dir = "."
            new_config = experiment.estimator.config.replace(
                save_summary_steps=None,
                save_checkpoints_steps=None,
                save_checkpoints_secs=None,
                log_step_count_steps=None)
            experiment.estimator._config = new_config
        logger.info("start training...")
        experiment.estimator.train(
            experiment.train_spec.input_fn,
            hooks=experiment.train_spec.hooks,
            max_steps=experiment.train_spec.max_steps)
    elif isinstance(experiment, KerasExperiment):
        if not is_chief(get_task_type(task)):
            if experiment.train_params['callbacks'] is not None:
                callbacks_to_keep = []
                for callback in experiment.train_params['callbacks']:
                    if not isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                        callbacks_to_keep.append(callback)
                experiment.train_params['callbacks'] = callbacks_to_keep
        if experiment.input_data_fn is not None:
            experiment.train_params['x'] = experiment.input_data_fn()
        if experiment.target_data_fn is not None:
            experiment.train_params['y'] = experiment.target_data_fn()
        logger.info("start training...")
        experiment.model.fit(**experiment.train_params)
    else:
        raise ValueError("experiment must be an Experiment or a KerasExperiment")
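# Counterpart sketch of the chief-side rendezvous implied by the worker above: the
# chief publishes its gloo rendezvous socket under "chief:0/sock_addr", collects each
# worker's "{task}/addr" broadcast, assigns ranks, and answers under "chief:0/{addr}"
# with the comma-separated six-field record the worker unpacks (rank, size, local
# rank, local size, cross rank, cross size). The ranking policy here (one process per
# host) and the function name are assumptions for illustration:
def _broadcast_worker_info_sketch(client, tasks, rendezvous_host, rendezvous_port):
    event.broadcast(client, "chief:0/sock_addr", f"{rendezvous_host}:{rendezvous_port}")
    size = len(tasks)
    for rank, task in enumerate(tasks):
        addr = event.wait(client, f"{task}/addr")
        info = ",".join(str(v) for v in (rank, size, 0, 1, rank, size))
        event.broadcast(client, f"chief:0/{addr}", info)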
def test__get_experiment_object():
    mocked_client = mock.MagicMock(spec=skein.ApplicationClient)
    experiment_obj = 'obj'

    def experiment_f():
        return experiment_obj

    mocked_client.kv.wait.return_value = cloudpickle.dumps(experiment_f)
    returned_object = _get_experiment(mocked_client)
    assert returned_object == experiment_obj
def main() -> None:
    task_type, task_id = cluster.get_task_description()
    client, cluster_spec, cluster_tasks = _prepare_container()
    # Variable TF_CONFIG must be set before instantiating
    # the estimator to train in a distributed way
    cluster.setup_tf_config(cluster_spec)

    experiment = _get_experiment(client)
    run_config = experiment.config

    tf.logging.info(f"Starting server {task_type}:{task_id}")
    cluster.start_tf_server(cluster_spec, run_config.session_config)
    thread = _execute_dispatched_function(client, experiment)

    # "ps" tasks do not terminate by themselves. See
    # https://github.com/tensorflow/tensorflow/issues/4713.
    # Tensorboard is terminated after all other tasks in _shutdown_container
    if task_type not in ['ps', 'tensorboard']:
        thread.join()
        tf.logging.info(f"{task_type}:{task_id} {thread.state}")

    _shutdown_container(client, cluster_tasks, run_config, thread)
def _worker_fn(client, task, net_if):
    event.broadcast(client, f"{task}/addr", net_if[1])

    worker_info = event.wait(client, f"chief:0/{net_if[1]}").split(',')
    driver_socket = event.wait(client, "chief:0/sock_addr").split(':')

    os.environ['HOROVOD_GLOO_RENDEZVOUS_ADDR'] = driver_socket[0]
    os.environ['HOROVOD_GLOO_RENDEZVOUS_PORT'] = driver_socket[1]
    os.environ['HOROVOD_CONTROLLER'] = 'gloo'
    os.environ['HOROVOD_CPU_OPERATIONS'] = 'gloo'
    os.environ['HOROVOD_GLOO_IFACE'] = net_if[0]
    os.environ['HOROVOD_RANK'] = worker_info[0]
    os.environ['HOROVOD_SIZE'] = worker_info[1]
    os.environ['HOROVOD_LOCAL_RANK'] = worker_info[2]
    os.environ['HOROVOD_LOCAL_SIZE'] = worker_info[3]
    os.environ['HOROVOD_CROSS_RANK'] = worker_info[4]
    os.environ['HOROVOD_CROSS_SIZE'] = worker_info[5]

    hvd.init()

    experiment = _task_commons._get_experiment(client)
    if task != 'chief:0':
        # Overwrite the config so this worker does nothing but training,
        # which improves training speed
        experiment.estimator._model_dir = "."
        new_config = experiment.estimator.config.replace(
            save_summary_steps=None,
            save_checkpoints_steps=None,
            save_checkpoints_secs=None,
            log_step_count_steps=None)
        experiment.estimator._config = new_config

    logger.info("start training...")
    experiment.estimator.train(
        experiment.train_spec.input_fn,
        hooks=experiment.train_spec.hooks,
        max_steps=experiment.train_spec.max_steps)
def _train(
        device: int,
        rank: int,
        world_size: int,
        collective_ops_backend: str) -> None:
    os.environ["NCCL_DEBUG"] = "INFO"
    _logger.info(f"[{os.getpid()}] device: {device}; rank: {rank}")
    os.environ[PYTORCH_DPP_RANK] = str(rank)

    client = skein.ApplicationClient.from_current()
    _setup_master(client, rank)
    dist.init_process_group(collective_ops_backend, rank=rank, world_size=world_size)

    experiment = _get_experiment(client)
    assert isinstance(experiment, PytorchExperiment)

    model = experiment.model.to(device)
    ddp_kwargs = experiment.ddp_args._asdict() if experiment.ddp_args else {}
    ddp_model = DDP(model, device_ids=[device], **ddp_kwargs)
    trainloader = _create_dataloader(experiment.train_dataset, experiment.dataloader_args)

    with tempfile.TemporaryDirectory() as tmp:
        tb_writer = SummaryWriter(tmp)
        experiment.main_fn(ddp_model, trainloader, f"cuda:{device}", rank, tb_writer)
        tb_writer.flush()
        tb_writer.close()
        if experiment.tensorboard_hdfs_dir:
            worker_tb_dir = os.path.join(experiment.tensorboard_hdfs_dir, f"worker{rank}")
            _upload_tensorboard_on_hdfs(tmp, worker_tb_dir)

    dist.destroy_process_group()
    _logger.info("Done training")
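# `_setup_master` is not shown here. A plausible sketch, assuming the usual PyTorch
# env:// rendezvous: rank 0 publishes its address through the same event helpers used
# elsewhere in this section, and every rank exports MASTER_ADDR/MASTER_PORT before
# dist.init_process_group. The kv key name and the port are illustrative assumptions:
import socket

def _setup_master_sketch(client, rank: int) -> None:
    if rank == 0:
        host = socket.gethostbyname(socket.gethostname())
        event.broadcast(client, "master_addr", f"{host}:29500")  # arbitrary fixed port
    addr, port = event.wait(client, "master_addr").split(":")
    os.environ["MASTER_ADDR"] = addr
    os.environ["MASTER_PORT"] = port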
def evaluator_fn(client):
    experiment = _task_commons._get_experiment(client)
    evaluate(experiment, timeout_in_secs=1200)  # Timeout after 20 min