Example #1
def _capture(args: List[str]):
    api_caller = ApiCaller("https://api.lab-ml.com/api/v1/track?",
                           {'run_uuid': generate_uuid()})
    api_logs = ApiLogs()
    data = {'name': 'Capture', 'comment': ' '.join(args), 'time': time.time()}

    def _started(url):
        if url is None:
            return None

        logger.log([('Monitor experiment at ', Text.meta), (url, Text.link)])
        webbrowser.open(url)

    api_caller.has_data(SimpleApiDataSource(data, callback=_started))
    api_logs.set_api(api_caller, frequency=0)

    thread = ExecutorThread(' '.join(args), api_logs)
    thread.start()
    thread.join()
    data = {
        'rank': 0,
        'status': 'completed',
        'details': None,
        'time': time.time()
    }

    api_caller.has_data(
        SimpleApiDataSource({
            'status': data,
            'time': time.time()
        }))

    api_caller.stop()
Example #2
def start_jobs():
    n_nodes = len([s for s in SERVERS])
    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * PROC_PER_NODE

    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            master_addr = SERVERS[server].conf.hostname

        for local_rank in range(PROC_PER_NODE):
            rank = node_rank * PROC_PER_NODE + local_rank
            # GLOO_SOCKET_IFNAME selects the network interface the Gloo backend binds to
            env_vars = {'GLOO_SOCKET_IFNAME': 'enp1s0',
                        'RUN_UUID': run_uuid,
                        'MASTER_ADDR': master_addr,
                        'MASTER_PORT': f'{1234}',
                        'WORLD_SIZE': f'{world_size}',
                        'RANK': f'{rank}',
                        'LOCAL_RANK': f'{local_rank}'}

            if PROC_PER_NODE > 1:
                # avoid over-subscribing CPU cores when several processes share a node
                env_vars['OMP_NUM_THREADS'] = '1'

            cmd = ['python', 'mnist.py']

            tags = TAGS.copy()
            if node_rank == 0 and local_rank == 0:
                tags += ['master']

            JOBS.create(server, ' '.join(cmd), env_vars, tags).start()
            time.sleep(1)
Example #3
def _launch(args: List[str]):
    import sys
    import os

    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    cwd = os.getcwd()
    if 'PYTHONPATH' in os.environ:
        python_path = os.environ['PYTHONPATH']
        print(python_path)
        os.environ['PYTHONPATH'] = f"{python_path}:{cwd}:{cwd}/src"
    else:
        os.environ['PYTHONPATH'] = f"{cwd}:{cwd}/src"

    cmd = [sys.executable, '-u', '-m', 'torch.distributed.launch', *args]
    print(cmd)
    try:
        process = subprocess.Popen(cmd, env=os.environ)
        process.wait()
    except Exception as e:
        logger.log('Error starting launcher', Text.danger)
        raise e

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode,
                                            cmd=cmd)
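
`_launch` forwards its arguments straight to `torch.distributed.launch`, so it accepts the same flags as the shell commands shown in Example #8. A hedged sketch of a call (the flag values and script path below are copied from those commands, not stated in this example):

# Illustrative invocation only; flags and script path are taken from Example #8.
_launch(['--nproc_per_node=2', '--nnodes=2', '--node_rank=0',
         '--master_addr=104.171.200.181', '--master_port=1234',
         'labml_samples/pytorch/ddp/mnist.py'])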
Example #4
def launch(python_cmd: str,
           *,
           tags: List[str],
           n_proc_per_node: int,
           use_env: bool = False,
           master_port: int = 1234,
           env_vars: Optional[Dict[str, str]] = None):
    n_nodes = len([s for s in SERVERS])

    run_uuid = experiment.generate_uuid()
    master_addr = None
    world_size = n_nodes * n_proc_per_node

    if env_vars is None:
        env_vars = {}

    for node_rank, server in enumerate(SERVERS):
        if master_addr is None:
            master_addr = SERVERS[server].conf.hostname

        for local_rank in range(n_proc_per_node):
            rank = node_rank * n_proc_per_node + local_rank
            proc_env_vars = {
                'RUN_UUID': run_uuid,
                'MASTER_ADDR': master_addr,
                'MASTER_PORT': f'{master_port}',
                'WORLD_SIZE': f'{world_size}',
                'NODE_RANK': f'{node_rank}',
                'RANK': f'{rank}',
                'LOCAL_RANK': f'{local_rank}'
            }

            if n_proc_per_node > 1:
                proc_env_vars['OMP_NUM_THREADS'] = '1'

            proc_env_vars.update(env_vars)
            cmd = ['python', python_cmd]

            if not use_env:
                # mimic torch.distributed.launch: pass the local rank as a
                # --local_rank argument unless the script reads LOCAL_RANK itself
                cmd += [f'--local_rank={local_rank}']

            proc_tags = tags.copy()
            if node_rank == 0 and local_rank == 0:
                proc_tags += ['master']

            JOBS.create(server, ' '.join(cmd), proc_env_vars,
                        proc_tags).start()
            time.sleep(1)
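
A hedged usage sketch for `launch`; the script name, tag, and interface setting are borrowed from the other examples in this listing rather than given here:

# Illustrative call: two processes per node, Gloo bound to enp1s0.
launch('mnist.py',
       tags=['mnist'],
       n_proc_per_node=2,
       master_port=1234,
       env_vars={'GLOO_SOCKET_IFNAME': 'enp1s0'})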
Example #5
def _capture(args: List[str]):
    api_caller = ApiCaller("https://api.labml.ai/api/v1/track?", {'run_uuid': generate_uuid()},
                           timeout_seconds=120)
    api_logs = ApiLogs()
    data = {
        'name': 'Capture',
        'comment': ' '.join(args),
        'time': time.time()
    }

    api_caller.add_handler(ApiUrlHandler(True, 'Monitor output at '))
    api_caller.has_data(SimpleApiDataSource(data))
    api_logs.set_api(api_caller, frequency=0)

    logger.log('Start capturing...', Text.meta)
    if args:
        thread = ExecutorThread(' '.join(args), api_logs)
        thread.start()
        thread.join()
    else:
        # no command given: relay stdin to the API, flushing on newlines
        # or roughly every 100 characters
        buffer = ''
        stdin = sys.stdin
        while stdin.readable():
            data = stdin.read(1)
            if len(data) == 0:
                break
            print(data, end='')
            buffer += data
            if '\n' in buffer or len(buffer) > 100:
                api_logs.outputs(stdout_=buffer)
                buffer = ''
        if len(buffer) > 0:
            api_logs.outputs(stdout_=buffer)

    data = {
        'rank': 0,
        'status': 'completed',
        'details': None,
        'time': time.time()
    }

    api_caller.has_data(SimpleApiDataSource({
        'status': data,
        'time': time.time()
    }))

    api_caller.stop()
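
Since this version of `_capture` falls back to reading standard input when no command is given, it can either wrap a command or sit at the end of a pipe. A small usage sketch (the command below is an illustrative assumption):

# Capture the output of a command run by the executor thread:
_capture(['python', 'train.py'])

# Capture whatever arrives on stdin instead (e.g. when output is piped in):
_capture([])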
Example #6
def main():
    if 'RUN_UUID' not in os.environ:
        os.environ['RUN_UUID'] = experiment.generate_uuid()

    logger.log(str(sys.argv), Text.danger)
    cmd = [
        sys.executable, '-u', '-m', 'torch.distributed.launch', *sys.argv[1:]
    ]
    # print(cmd)
    try:
        process = subprocess.Popen(cmd, env=os.environ)
        # print('wait')
        process.wait()
    except Exception as e:
        logger.log('Error starting launcher', Text.danger)
        raise e

    if process.returncode != 0:
        logger.log('Launcher failed', Text.danger)
        raise subprocess.CalledProcessError(returncode=process.returncode,
                                            cmd=cmd)
Example #7
# The excerpt shows only an indented function body; wrapping it as the
# _launcher() that the __main__ block below refers to (an assumption).
def _launcher():
    import os
    world_size = int(os.environ['WORLD_SIZE'])
    run_uuid = os.environ['RUN_UUID']
    local_rank = int(os.environ['LOCAL_RANK'])
    rank = int(os.environ['RANK'])
    inspect(world_size=os.environ['WORLD_SIZE'],
            run_uuid=os.environ['RUN_UUID'],
            local_rank=os.environ['LOCAL_RANK'],
            rank=os.environ['RANK'],
            master_addr=os.environ['MASTER_ADDR'],
            master_port=os.environ['MASTER_PORT'])
    main(local_rank, rank, world_size, run_uuid, 'env://')


def spawned(rank, world_size, uuid):
    main(rank, rank, world_size, uuid)


if __name__ == '__main__':
    # Run single GPU
    # main(0, 1, experiment.generate_uuid())

    # Spawn multiple GPU
    torch.multiprocessing.spawn(spawned,
                                args=(2, experiment.generate_uuid()),
                                nprocs=2,
                                join=True)

    # Run with `labml launch`, same arguments as `torch.distributed.launch`
    # _launcher()
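
`main` itself is outside this excerpt. As a rough sketch of what it presumably does, judging from the `'env://'` init method, the rank/world-size arguments, and the Gloo settings in the other examples (the default `init_method` below is a placeholder assumption):

import torch.distributed as dist

def main(local_rank: int, rank: int, world_size: int, uuid: str,
         init_method: str = 'env://'):
    # With 'env://', MASTER_ADDR / MASTER_PORT are read from the environment
    # set up by the launcher; the default here is only a guess.
    dist.init_process_group(backend='gloo', init_method=init_method,
                            rank=rank, world_size=world_size)
    # ... pick the device from local_rank, build the model, wrap it in
    # torch.nn.parallel.DistributedDataParallel, and run training under
    # the given run uuid ...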
Example #8
RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=0 --master_addr=104.171.200.181 --master_port=1234 labml_samples/pytorch/ddp/mnist.py
RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 --nnodes=2 --node_rank=1 --master_addr=104.171.200.181 --master_port=1234 labml_samples/pytorch/ddp/mnist.py

RUN_UUID=fba9eb202cb211eba5beacde48001122 PYTHONPATH="${PYTHONPATH}:$(pwd):$(pwd)/src" python -m torch.distributed.launch --nproc_per_node=2 labml_samples/pytorch/ddp/mnist.py
"""

import time

from labml import experiment
from labml_remote.job import JOBS
from labml_remote.server import SERVERS

PROC_PER_NODE = 1
N_NODES = len([s for s in SERVERS])

run_uuid = experiment.generate_uuid()
master_addr = None

for i, server in enumerate(SERVERS):
    if master_addr is None:
        master_addr = SERVERS[server].conf.hostname

    cmd = f'python -m torch.distributed.launch ' \
          f'--nproc_per_node={PROC_PER_NODE} ' \
          f'--nnodes={N_NODES} ' \
          f'--node_rank={i} ' \
          f'--master_addr={master_addr} --master_port=1234 ' \
          f'mnist.py'

    env_vars = {'GLOO_SOCKET_IFNAME': 'enp1s0', 'RUN_UUID': run_uuid}
    tags = ['mnist']
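
The excerpt cuts off before the job is actually submitted; judging by the identical loop in Example #2, it presumably ends with each job being created and started:

    # Presumed continuation (mirrors Example #2); not part of the excerpt.
    JOBS.create(server, cmd, env_vars, tags).start()
    time.sleep(1)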