Example #1
import itertools
import os
import sys

import pytest

# WorkerError is assumed to be exported alongside launch
# (see Example #5 for the launch import path).
from espnet.distributed.pytorch_backend.launch import WorkerError, launch


def test_worker_exits_nonzero_code_ng(nprocs, exitcode):
    for combination in itertools.product(range(2), repeat=nprocs):
        n_activated = sum(combination)
        if n_activated != 1 and n_activated != nprocs:
            # Only exercise the cases where exactly one worker
            # or all workers fail.
            continue
        if n_activated == 1:
            exit_idx = combination.index(1)
        else:
            exit_idx = None
        args = None

        def simple_func(args):
            # Exit with `exitcode` on the designated rank(s);
            # all other ranks return normally.
            rank = os.environ.get("RANK", None)
            assert rank is not None
            rank = int(rank)
            if n_activated == 1 and rank != exit_idx:
                return
            sys.exit(exitcode)

        with pytest.raises(WorkerError) as excinfo:
            launch(simple_func, args, nprocs)
        assert excinfo.value.exitcode == exitcode
        if n_activated == 1:
            assert excinfo.value.worker_id == exit_idx
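These tests receive nprocs and exitcode as pytest fixtures. A minimal conftest.py sketch that would drive them might look like the following; the parameter values are illustrative assumptions, not the project's actual configuration.

import pytest


@pytest.fixture(params=[1, 2, 4])
def nprocs(request):
    # Number of worker processes handed to launch().
    return request.param


@pytest.fixture(params=[1, 255])
def exitcode(request):
    # Nonzero exit code a failing worker terminates with.
    return request.param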
Example #2
import os
from multiprocessing import Queue

from espnet.distributed.pytorch_backend.launch import launch


def test_simple_function_ok_with_right_envvar(nprocs):
    # Use a multiprocessing Queue so spawned workers can report
    # back to the parent process.
    queue = Queue()

    def simple_func(queue):
        worldsize = os.environ.get("WORLD_SIZE", None)
        rank = os.environ.get("RANK", None)
        localrank = os.environ.get("LOCAL_RANK", None)
        assert worldsize is not None
        assert rank is not None
        assert localrank is not None
        queue.put({
            "worldsize": int(worldsize),
            "rank": int(rank),
            "localrank": int(localrank),
        })
        return 0

    launch(simple_func, queue, nprocs)

    results = [queue.get() for _ in range(nprocs)]
    # Every rank from 0..nprocs-1 should report back exactly once.
    ranks = set(range(nprocs))
    for r in results:
        worldsize = r["worldsize"]
        rank = r["rank"]
        localrank = r["localrank"]
        assert worldsize == nprocs
        assert rank in ranks
        assert localrank in ranks
        ranks.remove(rank)
    assert len(ranks) == 0
    assert queue.empty()
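For intuition, here is a toy model of the contract this test relies on: the launcher spawns nprocs workers, exports WORLD_SIZE, RANK, and LOCAL_RANK to each, and joins them. This is an illustrative sketch assuming picklable module-level worker functions, not ESPnet's actual launch implementation.

import os
from multiprocessing import get_context


def _worker(func, args, env):
    # Export the distributed environment before the user function runs.
    os.environ.update(env)
    func(args)


def toy_launch(func, args, nprocs):
    # Spawned (not forked) workers, mirroring Example #5's
    # set_start_method("spawn").
    ctx = get_context("spawn")
    procs = []
    for rank in range(nprocs):
        env = {
            "WORLD_SIZE": str(nprocs),
            "RANK": str(rank),
            "LOCAL_RANK": str(rank),  # single node: local rank == global rank
        }
        p = ctx.Process(target=_worker, args=(func, args, env))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
        if p.exitcode != 0:
            raise RuntimeError(f"worker failed with exit code {p.exitcode}")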
Example #3
from espnet.distributed.pytorch_backend.launch import launch


def test_simple_function_ok(nprocs):
    args = None

    def simple_func(args):
        # NOP
        return 0

    launch(simple_func, args, nprocs)
Example #4
import argparse

from espnet.distributed.pytorch_backend.launch import launch


def test_simple_function_ok_with_args(nprocs):
    args = argparse.Namespace(**{f"param{v}": v for v in range(10)})

    def simple_func(args):
        args_dict = vars(args)
        for v in range(10):
            key = f"param{v}"
            assert key in args_dict
            assert args_dict[key] == v
        return 0

    launch(simple_func, args, nprocs)
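Because Example #5 below sets the "spawn" start method, whatever is passed as args has to be serialized for each worker. A pickle round-trip is a cheap preflight check for a Namespace like the one above; this snippet is purely illustrative.

import argparse
import pickle

args = argparse.Namespace(**{f"param{v}": v for v in range(10)})
# A spawn-based launcher must pickle args for each child process,
# so verify the Namespace survives a round-trip unchanged.
restored = pickle.loads(pickle.dumps(args))
assert vars(restored) == vars(args)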
Example #5
def main(cmd_args):
    """Run the main training function."""
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}."
            "Use --dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        if args.num_spkrs == 1:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
        else:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    setup_logging(args.verbose)

    # If --ngpu is not given:
    #   1. if CUDA_VISIBLE_DEVICES is set, use all visible devices
    #   2. else, if nvidia-smi exists, use all devices it reports
    #   3. else, ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            ngpu = len(cvd.split(","))
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                ngpu = 0
            else:
                # nvidia-smi -L prints one "GPU N: ..." line per device to stdout.
                ngpu = len(p.stdout.decode().split("\n")) - 1
    else:
        if args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    if args.use_ddp and ngpu <= 0:
        raise ValueError("DDP requires at least 1 GPU.")
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        # for non-autoregressive maskctc model
        if "maskctc" in args.model_module:
            char_list.append("<mask>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.use_ddp:
        # DDP is supported only with the PyTorch backend;
        # Chainer is out of scope.
        if args.num_spkrs == 1:
            if args.backend == "chainer":
                raise ValueError("Chainer with DDP is not supported.")
            from espnet.distributed.pytorch_backend.launch import (
                launch,
                set_start_method,
            )

            # NOTE: "spawn" must be set as the multiprocessing start
            # method here. CUDA has already been initialized in this
            # parent process, and a CUDA context cannot be shared with
            # child processes. The default "fork" start method creates
            # children that share the parent's address space (and hence
            # its CUDA context); "spawn" gives each worker a fresh,
            # separate memory space.
            set_start_method("spawn")
            # Use the resolved GPU count, which also covers the case
            # where --ngpu was not given on the command line.
            launch(_reinitialize_logging_and_call_train, args, ngpu)
        else:
            raise ValueError("Single speaker is only supported when using DDP.")
    else:
        if args.num_spkrs == 1:
            if args.backend == "chainer":
                from espnet.asr.chainer_backend.asr import train

                train(args)
            elif args.backend == "pytorch":
                from espnet.asr.pytorch_backend.asr import train

                train(args)
            else:
                raise ValueError("Only chainer and pytorch are supported.")
        else:
            # FIXME(kamo): Support --model-module
            if args.backend == "pytorch":
                from espnet.asr.pytorch_backend.asr_mix import train

                train(args)
            else:
                raise ValueError("Only pytorch is supported.")