def start_jobs():
    """Launch one training process per (node, local-rank) pair on every server.

    The first server yielded by ``SERVERS`` acts as the distributed rendezvous
    master; each process receives the standard distributed-training environment
    variables (MASTER_ADDR, MASTER_PORT, WORLD_SIZE, RANK, LOCAL_RANK).
    """
    # sum(1 for ...) counts any iterable without materializing a throwaway list
    n_nodes = sum(1 for _ in SERVERS)
    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * PROC_PER_NODE
    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            # First server in iteration order is the master
            master_addr = SERVERS[server].conf.hostname
        for local_rank in range(PROC_PER_NODE):
            # Global rank is contiguous across nodes
            rank = node_rank * PROC_PER_NODE + local_rank
            env_vars = {
                'GLOO_SOCKET_IFNAME': 'enp1s0',
                'RUN_UUID': run_uuid,
                'MASTER_ADDR': master_addr,
                'MASTER_PORT': '1234',
                'WORLD_SIZE': f'{world_size}',
                'RANK': f'{rank}',
                'LOCAL_RANK': f'{local_rank}',
            }
            if PROC_PER_NODE > 1:
                # Avoid CPU oversubscription when several processes share a node
                env_vars['OMP_NUM_THREADS'] = '1'
            cmd = ['python', 'mnist.py']
            tags = TAGS.copy()
            if node_rank == 0 and local_rank == 0:
                tags += ['master']
            JOBS.create(server, ' '.join(cmd), env_vars, tags).start()
            # Give the master process a head start before launching the rest
            time.sleep(1)
def launch(python_cmd: str, *, tags: List[str], n_proc_per_node: int,
           use_env: bool = False, master_port: int = 1234,
           env_vars: Optional[Dict[str, str]] = None):
    """Launch ``python_cmd`` on every server, one process per local rank.

    Args:
        python_cmd: script (and arguments) to run with ``python``.
        tags: tags attached to every created job; rank 0 also gets ``master``.
        n_proc_per_node: processes to start on each server.
        use_env: if ``True`` the script reads ``LOCAL_RANK`` from the
            environment; otherwise ``--local_rank=N`` is appended to the command.
        master_port: rendezvous port exported as ``MASTER_PORT``.
        env_vars: extra environment variables; these override the generated
            defaults (applied last via ``dict.update``).
    """
    # sum(1 for ...) counts any iterable without materializing a throwaway list
    n_nodes = sum(1 for _ in SERVERS)
    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * n_proc_per_node
    if env_vars is None:
        env_vars = {}
    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            # First server in iteration order is the master
            master_addr = SERVERS[server].conf.hostname
        for local_rank in range(n_proc_per_node):
            rank = node_rank * n_proc_per_node + local_rank
            proc_env_vars = {
                'RUN_UUID': run_uuid,
                'MASTER_ADDR': master_addr,
                'MASTER_PORT': f'{master_port}',
                'WORLD_SIZE': f'{world_size}',
                'NODE_RANK': f'{node_rank}',
                'RANK': f'{rank}',
                'LOCAL_RANK': f'{local_rank}',
            }
            if n_proc_per_node > 1:
                # Avoid CPU oversubscription when several processes share a node
                proc_env_vars['OMP_NUM_THREADS'] = '1'
            # Caller-supplied variables win over the generated defaults
            proc_env_vars.update(env_vars)
            cmd = ['python', python_cmd]
            if not use_env:
                cmd += [f'--local_rank={local_rank}']
            proc_tags = tags.copy()
            if node_rank == 0 and local_rank == 0:
                proc_tags += ['master']
            JOBS.create(server, ' '.join(cmd), proc_env_vars, proc_tags).start()
            # Give the master process a head start before launching the rest
            time.sleep(1)
def job_tail(job: str, tag: List[str], delay: int):
    """Tail job output"""
    matches = util.get_jobs(job, tag)
    if len(matches) > 1:
        click.echo(f"Selecting a job out of {len(matches)}")
    job = matches[0]
    _job = JOBS.by_key(job)
    logger.log(util.log_job(_job))
    _job.tail()
    # A non-positive delay means "print once and exit" rather than watch
    if delay <= 0:
        return
    while not _job.stopped:
        # Pull fresh job state from the server quietly before re-tailing
        with monit.section('rsync', is_silent=True):
            _job.server.rsync_jobs(ui_mode=UIMode.none, is_silent=True)
        time.sleep(0.5)
        _job.update_stopped()
        _job.tail()
        time.sleep(delay)
def job_list(is_rsync_before: bool, show_stopped: bool, show_hidden: bool, tag: List[str]):
    """Show list of jobs

    Args:
        is_rsync_before: sync job state from every server before listing.
        show_stopped: include jobs that are no longer running.
        show_hidden: include jobs tagged ``__hidden__``.
        tag: only list jobs carrying every one of these tags.
    """
    if is_rsync_before:
        for k in util.get_servers(''):
            SERVERS[k].rsync_jobs()
            time.sleep(0.5)
    for _job in JOBS.all():
        # Guard clauses replace the original flag-variable loop; any() short-circuits
        if any(t not in _job.tags for t in tag):
            continue
        if not show_hidden and '__hidden__' in _job.tags:
            continue
        if not show_stopped and not _job.running:
            continue
        logger.log(util.log_job(_job))
def job_kill(job: List[str], tag: List[str], signal: str):
    """Kill jobs

    Sends ``kill -<signal>`` to every running job matched by ``job``/``tag``.
    With neither given, asks for confirmation before killing everything.
    """
    if not job and not tag:
        if not click.confirm("Killing all jobs. Do you want to continue?"):
            return 1
    jobs = util.get_jobs(job, tag, is_master=False)
    for j in jobs:
        _job = JOBS.by_key(j)
        if not _job.running:
            # Nothing to signal
            continue
        log_parts = util.log_job(_job)
        res = _job.server.shell(f'kill -{signal} {_job.pid}')
        if res.exit_code == 0:
            log_parts += [(' KILLED', Text.success)]
            logger.log(log_parts)
        else:
            # BUG FIX: failure was previously colored Text.success;
            # warning matches how stderr is rendered below
            log_parts += [(' FAILED', Text.warning)]
            logger.log(log_parts)
            logger.log(res.out)
            logger.log(res.err, Text.warning)
import time

from labml import experiment
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

PROC_PER_NODE = 1
# Count servers without assuming SERVERS supports len()
N_NODES = len([s for s in SERVERS])

run_uuid = experiment.generate_uuid()
master_addr = None

# Start one torch.distributed.launch per node; node 0 is the master.
for i, server in enumerate(SERVERS):
    if master_addr is None:
        master_addr = SERVERS[server].conf.hostname
    cmd = ' '.join([
        'python -m torch.distributed.launch',
        f'--nproc_per_node={PROC_PER_NODE}',
        f'--nnodes={N_NODES}',
        f'--node_rank={i}',
        f'--master_addr={master_addr} --master_port=1234',
        'mnist.py',
    ])
    env_vars = {'GLOO_SOCKET_IFNAME': 'enp1s0', 'RUN_UUID': run_uuid}
    tags = ['mnist'] + (['master'] if i == 0 else [])
    JOBS.create(server, cmd, env_vars, tags).start()
    # Let the master come up before starting the next node
    time.sleep(1)
def job_run(server: str, cmd: str, env: List[Tuple[str, str]], tag: List[str]):
    """Start a job"""
    # Fall back to default tags when none were supplied
    job_tags = tag if tag else ['custom', 'no-tags']
    JOBS.create(server, cmd, util.get_env_dict(env), job_tags).start()
for t in tag: if t not in _job.tags: matched = False if not matched: continue if not show_hidden and '__hidden__' in _job.tags: continue if not show_stopped and not _job.running: continue logger.log(util.log_job(_job)) @click.command() @click.option('--job', type=click.Choice(list(JOBS.job_keys())), help='Job to tail') @click.option('--tag', multiple=True, type=click.STRING, help='Find job to tail by tags') @click.option('--delay', type=click.INT, default=5, help="Refresh delay. 0 not to watch") def job_tail(job: str, tag: List[str], delay: int): """Tail job output""" jobs = util.get_jobs(job, tag) if len(jobs) > 1: click.echo(f"Selecting a job out of {len(jobs)}") job = jobs[0]
# Launch PROC_PER_NODE processes on each server; global rank 0 is the master.
# NOTE(review): reads master_addr / run_uuid / MASTER_PORT / USE_ENV / world_size
# defined earlier in the file.
for i, server in enumerate(SERVERS):
    if master_addr is None:
        # First server encountered becomes the rendezvous master
        master_addr = SERVERS[server].conf.hostname
    for local_rank in range(PROC_PER_NODE):
        rank = i * PROC_PER_NODE + local_rank
        env_vars = {
            'GLOO_SOCKET_IFNAME': 'enp1s0',
            'RUN_UUID': run_uuid,
            'MASTER_ADDR': master_addr,
            'MASTER_PORT': f'{MASTER_PORT}',
            'WORLD_SIZE': f'{world_size}',
            'RANK': f'{rank}',
            'LOCAL_RANK': f'{local_rank}',
        }
        if PROC_PER_NODE > 1:
            # Keep each process to one OMP thread when sharing a node
            env_vars['OMP_NUM_THREADS'] = '1'
        cmd = ['python', 'mnist.py']
        if not USE_ENV:
            # Script takes the local rank on the command line instead of env
            cmd += [f'--local_rank={local_rank}']
        tags = ['mnist'] + (['master'] if i == 0 and local_rank == 0 else [])
        JOBS.create(server, ' '.join(cmd), env_vars, tags).start()
        time.sleep(1)
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

# Run two copies of the hello-world script on the first configured server;
# the first job is additionally tagged as the master.
server = next(iter(SERVERS))
for job_tags in (['hello', 'master'], ['hello']):
    JOBS.create(server, 'python hello_world_sleep.py', {}, job_tags).start()