Exemplo n.º 1
0
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        return single_train.TrainLoop(opt).train()
Exemplo n.º 2
0
def multiprocess_eval(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a multiprocessing evaluation.

    Invoked by launch_and_eval, not instantiated directly.
    """
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        return eval_model.eval_model(opt)
Exemplo n.º 3
0
def multiprocess_eval(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Run a multiprocessing evaluation.

    Invoked by launch_and_eval, not instantiated directly.
    """
    init_method = f'tcp://{hostname}:{port}'
    with distributed_utils.distributed_context(
        rank, opt, rank_offset, gpu, init_method=init_method
    ) as opt:
        opt['multiprocessing'] = True
        return eval_model.eval_model(opt)
Exemplo n.º 4
0
def multiprocess_train(rank,
                       opt,
                       port=61337,
                       rank_offset=0,
                       gpu=None,
                       hostname='localhost'):
    init_method = f"tcp://{hostname}:{port}"
    with distributed_utils.distributed_context(rank,
                                               opt,
                                               rank_offset,
                                               gpu,
                                               init_method=init_method) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        try:
            return single_train.TrainLoop(opt).train()
        except Exception:
            import parlai.utils.logging as logging

            logging.critical(traceback.format_exc())
            logging.critical(
                f"Got the above exception on worker {rank + rank_offset}. "
                "This may cause hangs requiring manual killing of processes.")
            raise