示例#1
0
def start_jobs():
    """Create and start one ``python mnist.py`` job per process on every server.

    For each server in ``SERVERS`` this starts ``PROC_PER_NODE`` jobs. Every
    job receives the same ``RUN_UUID``, the hostname of the first server
    iterated as ``MASTER_ADDR``, port ``1234`` as ``MASTER_PORT``, and its own
    ``RANK`` / ``LOCAL_RANK``. The very first job (first server, local rank 0)
    is additionally tagged ``'master'``.
    """
    node_count = sum(1 for _ in SERVERS)
    shared_uuid = experiment.generate_uuid()
    first_host = None
    total_procs = node_count * PROC_PER_NODE
    command = ' '.join(['python', 'mnist.py'])

    for node_idx, server_key in enumerate(SERVERS):
        # The first server visited provides the address used by every job.
        if first_host is None:
            first_host = SERVERS[server_key].conf.hostname

        for proc_idx in range(PROC_PER_NODE):
            global_rank = node_idx * PROC_PER_NODE + proc_idx

            job_env = {
                'GLOO_SOCKET_IFNAME': 'enp1s0',
                'RUN_UUID': shared_uuid,
                'MASTER_ADDR': first_host,
                'MASTER_PORT': '1234',
                'WORLD_SIZE': f'{total_procs}',
                'RANK': f'{global_rank}',
                'LOCAL_RANK': f'{proc_idx}',
            }
            # Only set when several processes share one node.
            if PROC_PER_NODE > 1:
                job_env['OMP_NUM_THREADS'] = '1'

            # Work on a copy so the shared TAGS list is never mutated.
            job_tags = TAGS.copy()
            is_first_job = node_idx == 0 and proc_idx == 0
            if is_first_job:
                job_tags.append('master')

            new_job = JOBS.create(server_key, command, job_env, job_tags)
            new_job.start()
            # Pause for a second between consecutive job starts.
            time.sleep(1)
示例#2
0
def launch(python_cmd: str,
           *,
           tags: List[str],
           n_proc_per_node: int,
           use_env: bool = False,
           master_port: int = 1234,
           env_vars: Optional[Dict[str, str]] = None):
    """Start ``n_proc_per_node`` jobs running ``python <python_cmd>`` on every server.

    Args:
        python_cmd: Text appended after ``python`` to form the job command.
        tags: Tags attached to every job; the first job (first server,
            local rank 0) also gets ``'master'``. The list is copied, not
            mutated.
        n_proc_per_node: Number of jobs created per server.
        use_env: When false, ``--local_rank=<local_rank>`` is appended to the
            command; when true it is left off.
        master_port: Value exported as ``MASTER_PORT``.
        env_vars: Extra environment variables; these are applied last and so
            override the generated ones on key collisions.
    """
    node_count = sum(1 for _ in SERVERS)

    shared_uuid = experiment.generate_uuid()
    first_host = None
    total_procs = node_count * n_proc_per_node

    extra_env = {} if env_vars is None else env_vars

    for node_idx, server_key in enumerate(SERVERS):
        # The first server visited provides the address used by every job.
        if first_host is None:
            first_host = SERVERS[server_key].conf.hostname

        for proc_idx in range(n_proc_per_node):
            global_rank = node_idx * n_proc_per_node + proc_idx

            job_env = {
                'RUN_UUID': shared_uuid,
                'MASTER_ADDR': first_host,
                'MASTER_PORT': f'{master_port}',
                'WORLD_SIZE': f'{total_procs}',
                'NODE_RANK': f'{node_idx}',
                'RANK': f'{global_rank}',
                'LOCAL_RANK': f'{proc_idx}',
            }
            # Only set when several processes share one node.
            if n_proc_per_node > 1:
                job_env['OMP_NUM_THREADS'] = '1'
            # Caller-supplied variables win over the generated defaults.
            job_env.update(extra_env)

            command_parts = ['python', python_cmd]
            if not use_env:
                command_parts.append(f'--local_rank={proc_idx}')

            job_tags = tags.copy()
            if node_idx == 0 and proc_idx == 0:
                job_tags.append('master')

            new_job = JOBS.create(server_key, ' '.join(command_parts),
                                  job_env, job_tags)
            new_job.start()
            # Pause for a second between consecutive job starts.
            time.sleep(1)
示例#3
0
def job_tail(job: str, tag: List[str], delay: int):
    """Tail job output"""
    # Parameters:
    #   job   - job key used to look up the job (may be empty when selecting
    #           by tags only).
    #   tag   - tags used to look up the job.
    #   delay - seconds to wait between refreshes; a value <= 0 prints the
    #           output once and returns without watching.
    jobs = util.get_jobs(job, tag)
    if not jobs:
        # Nothing matched: report it instead of failing with an IndexError
        # on the `jobs[0]` lookup below.
        click.echo("No matching jobs found")
        return
    if len(jobs) > 1:
        click.echo(f"Selecting a job out of {len(jobs)}")

    # When several jobs match, the first one returned is tailed.
    _job = JOBS.by_key(jobs[0])
    logger.log(util.log_job(_job))
    _job.tail()
    if delay <= 0:
        return

    # Watch mode: sync the job files from the server, refresh the stopped
    # state and print the tail again, until the job is reported as stopped.
    while not _job.stopped:
        with monit.section('rsync', is_silent=True):
            _job.server.rsync_jobs(ui_mode=UIMode.none, is_silent=True)
        time.sleep(0.5)
        _job.update_stopped()
        _job.tail()
        time.sleep(delay)
示例#4
0
def job_list(is_rsync_before: bool, show_stopped: bool, show_hidden: bool,
             tag: List[str]):
    """Show list of jobs"""
    # Optionally sync job data from every server first, then pause briefly
    # before listing.
    if is_rsync_before:
        for server_key in util.get_servers(''):
            SERVERS[server_key].rsync_jobs()
        time.sleep(0.5)

    for candidate in JOBS.all():
        # A job is listed only if it carries every requested tag.
        if any(wanted not in candidate.tags for wanted in tag):
            continue

        # Jobs tagged '__hidden__' are skipped unless show_hidden is set.
        hidden_filtered = not show_hidden and '__hidden__' in candidate.tags
        if hidden_filtered:
            continue

        # Jobs that are not running are skipped unless show_stopped is set.
        if not (show_stopped or candidate.running):
            continue

        logger.log(util.log_job(candidate))
示例#5
0
def job_kill(job: List[str], tag: List[str], signal: str):
    """Kill jobs"""
    # Parameters:
    #   job    - job keys to select; empty together with `tag` means all jobs.
    #   tag    - tags to select jobs by.
    #   signal - signal name/number placed after `kill -` in the shell command.
    # Returns 1 when the user declines the "kill all" confirmation, otherwise
    # None.
    if not job and not tag:
        # No filter at all selects every job, so ask before proceeding.
        if not click.confirm("Killing all jobs. Do you want to continue?"):
            return 1

    jobs = util.get_jobs(job, tag, is_master=False)

    for j in jobs:
        _job = JOBS.by_key(j)
        # Only jobs that are currently running are signalled.
        if not _job.running:
            continue
        log_parts = util.log_job(_job)
        res = _job.server.shell(f'kill -{signal} {_job.pid}')
        if res.exit_code == 0:
            log_parts += [(' KILLED', Text.success)]
            logger.log(log_parts)
        else:
            # A failed kill is an error, so it must not be styled as success.
            log_parts += [(' FAILED', Text.danger)]
            logger.log(log_parts)
            # Show what the remote shell printed to help diagnose the failure.
            logger.log(res.out)
            logger.log(res.err, Text.warning)
示例#6
0
import time

from labml import experiment
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

# Launch `mnist.py` through `torch.distributed.launch` on every server,
# starting one launcher job per server.
PROC_PER_NODE = 1
N_NODES = sum(1 for _ in SERVERS)

run_uuid = experiment.generate_uuid()
master_addr = None

for i, server in enumerate(SERVERS):
    # The first server visited provides the master address for all jobs.
    if master_addr is None:
        master_addr = SERVERS[server].conf.hostname

    cmd = ' '.join([
        'python -m torch.distributed.launch',
        f'--nproc_per_node={PROC_PER_NODE}',
        f'--nnodes={N_NODES}',
        f'--node_rank={i}',
        f'--master_addr={master_addr} --master_port=1234',
        'mnist.py',
    ])

    env_vars = {
        'GLOO_SOCKET_IFNAME': 'enp1s0',
        'RUN_UUID': run_uuid,
    }

    # Only the job on the first server carries the extra 'master' tag.
    tags = ['mnist', 'master'] if i == 0 else ['mnist']

    JOBS.create(server, cmd, env_vars, tags).start()
    # Pause for a second between consecutive job starts.
    time.sleep(1)
示例#7
0
def job_run(server: str, cmd: str, env: List[Tuple[str, str]], tag: List[str]):
    """Start a job"""
    # Jobs started without any tag get the placeholder tags below.
    job_tags = tag if tag else ['custom', 'no-tags']
    new_job = JOBS.create(server, cmd, util.get_env_dict(env), job_tags)
    new_job.start()
示例#8
0
        for t in tag:
            if t not in _job.tags:
                matched = False
        if not matched:
            continue
        if not show_hidden and '__hidden__' in _job.tags:
            continue
        if not show_stopped and not _job.running:
            continue

        logger.log(util.log_job(_job))


# NOTE(review): this excerpt stops right after the job is selected; the rest
# of the command body is not visible here.
@click.command()
# The allowed --job values are computed from JOBS.job_keys() when this
# decorator is evaluated, not when the command runs.
@click.option('--job',
              type=click.Choice(list(JOBS.job_keys())),
              help='Job to tail')
# --tag may be given several times.
@click.option('--tag',
              multiple=True,
              type=click.STRING,
              help='Find job to tail by tags')
@click.option('--delay',
              type=click.INT,
              default=5,
              help="Refresh delay. 0 not to watch")
def job_tail(job: str, tag: List[str], delay: int):
    """Tail job output"""
    jobs = util.get_jobs(job, tag)
    # When more than one job matches, tell the user and use the first one.
    if len(jobs) > 1:
        click.echo(f"Selecting a job out of {len(jobs)}")
    # NOTE(review): this indexes without checking for an empty result;
    # confirm util.get_jobs never returns an empty sequence.
    job = jobs[0]
示例#9
0
# Start PROC_PER_NODE `python mnist.py` jobs on every server, passing the
# rank information through environment variables.
for i, server in enumerate(SERVERS):
    # The first server visited provides the master address for all jobs.
    if master_addr is None:
        master_addr = SERVERS[server].conf.hostname

    for local_rank in range(PROC_PER_NODE):
        rank = i * PROC_PER_NODE + local_rank

        env_vars = {
            'GLOO_SOCKET_IFNAME': 'enp1s0',
            'RUN_UUID': run_uuid,
            'MASTER_ADDR': master_addr,
            'MASTER_PORT': f'{MASTER_PORT}',
            'WORLD_SIZE': f'{world_size}',
            'RANK': f'{rank}',
            'LOCAL_RANK': f'{local_rank}',
        }
        # Only set when several processes share one node.
        if PROC_PER_NODE > 1:
            env_vars.update({'OMP_NUM_THREADS': '1'})

        # Unless USE_ENV is set, the local rank is also passed as an argument.
        if USE_ENV:
            cmd = ['python', 'mnist.py']
        else:
            cmd = ['python', 'mnist.py', f'--local_rank={local_rank}']

        # Only the very first job carries the extra 'master' tag.
        if i == 0 and local_rank == 0:
            tags = ['mnist', 'master']
        else:
            tags = ['mnist']

        JOBS.create(server, ' '.join(cmd), env_vars, tags).start()
        # Pause for a second between consecutive job starts.
        time.sleep(1)
示例#10
0
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

# Start two `hello_world_sleep.py` jobs on the first server yielded by
# SERVERS; only the first job carries the extra 'master' tag.
server = next(iter(SERVERS))
for job_tags in (['hello', 'master'], ['hello']):
    JOBS.create(server, 'python hello_world_sleep.py', {}, job_tags).start()