Example #1
def train_model(opt):
    """
    Run through a TrainLoop.

    If model_file is not in opt, then this helper will create a temporary
    directory to store the model, dict, etc.

    :return: (stdout, valid_results, test_results)
    :rtype: (str, dict, dict)
    """
    import parlai.scripts.train_model as tms

    with capture_output() as output:
        with tempdir() as tmpdir:
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(tmpdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
            parser = tms.setup_args()
            # needed at the very least to set the overrides.
            parser.set_params(**opt)
            parser.set_params(log_every_n_secs=10)
            popt = parser.parse_args(print_args=False)
            # in some rare cases, like for instance if the model class also
            # overrides its default params, the params override will not
            # be taken into account.
            for k, v in opt.items():
                popt[k] = v
            tl = tms.TrainLoop(popt)
            valid, test = tl.train()

    return (output.getvalue(), valid, test)
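A minimal usage sketch for the helper above. The task and model names are borrowed from the other examples on this page; treat the exact contents of the returned report dicts as an assumption rather than something this snippet guarantees.

# Hypothetical call to the train_model() helper defined above; assumes the
# integration_tests task and repeat_label model used elsewhere on this page.
stdout, valid, test = train_model(
    {
        'task': 'integration_tests',
        'model': 'repeat_label',
        'num_epochs': 1.0,
    }
)
# valid and test are plain dict-like reports produced by TrainLoop.train()
print(sorted(valid.keys()))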
Example #2
        def get_tl(tmpdir):
            final_opt = Opt({
                'task': 'integration_tests',
                'datatype': 'valid',
                'validation_max_exs': 30,
                'short_final_eval': True,
            })
            final_opt.save(os.path.join(tmpdir, "final_opt.opt"))

            opt = Opt({
                'task': 'integration_tests',
                'validation_max_exs': 10,
                'model': 'repeat_label',
                'model_file': os.path.join(tmpdir, 'model'),
                'short_final_eval': True,
                'num_epochs': 1.0,
                'final_extra_opt': str(os.path.join(tmpdir, "final_opt.opt")),
            })
            parser = tms.setup_args()
            parser.set_params(**opt)
            popt = parser.parse_args([])
            for k, v in opt.items():
                popt[k] = v
            return tms.TrainLoop(popt)
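A sketch of how get_tl might be driven. It assumes the same tempdir helper seen in Example #1 (here referenced as testing_utils.tempdir, which is an assumption about the ParlAI version) and that TrainLoop.train() returns a (valid_report, test_report) pair, as in the other examples.

# Hypothetical driver for get_tl(); tempdir() and the train() return shape
# follow the surrounding examples on this page.
with testing_utils.tempdir() as tmpdir:
    tl = get_tl(tmpdir)
    valid, test = tl.train()
    # because final_extra_opt points at final_opt.opt, the final evaluation
    # is re-run with validation_max_exs=30 instead of the 10 used above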
Example #3
def get_popt_and_tl(opt):
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([], print_args=False)
    for k, v in opt.items():
        popt[k] = v
    return popt, tms.TrainLoop(popt)
Example #4
def train_model(opt):
    """
    Runs through a TrainLoop.

    If model_file is not in opt, then this helper will create a temporary
    directory to store the model, dict, etc.

    :return: (stdout, valid_results, test_results)
    :rtype: (str, dict, dict)
    """
    import parlai.scripts.train_model as tms

    with capture_output() as output:
        with tempdir() as tmpdir:
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(tmpdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
            parser = tms.setup_args()
            parser.set_params(**opt)
            popt = parser.parse_args(print_args=False)
            tl = tms.TrainLoop(popt)
            valid, test = tl.train()

    return (
        output.getvalue(),
        valid,
        test,
    )
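For completeness, a sketch of how get_popt_and_tl from Example #3 could be used to inspect the fully parsed options before training. The model_file handling mirrors the train_model helpers above; the specific assertion is only illustrative.

# Hypothetical use of get_popt_and_tl(); a model_file is supplied because the
# tests further down show TrainLoop raising RuntimeError when neither
# model_file nor dict_file is set.
with tempdir() as tmpdir:
    popt, tl = get_popt_and_tl(
        {
            'task': 'integration_tests',
            'model': 'repeat_label',
            'model_file': os.path.join(tmpdir, 'model'),
            'num_epochs': 1.0,
        }
    )
    assert popt['task'] == 'integration_tests'
    valid, test = tl.train()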
Example #5
def run(self):
    with distributed_utils.slurm_distributed_context(self.opt) as opt:
        self.train_loop = single_train.TrainLoop(opt)
        # expose the agent's final options on the parser before printing them
        self.parser = self.parser
        self.parser.opt = self.train_loop.agent.opt
        self.parser.print_args()
        return self.train_loop.train()
Example #6
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Subprocess which initializes distributed training, and begins training.

    This should be launched n times for n GPUs; this is handled either in main
    or via srun.

    :param int rank: This process's rank - 1. (Starts at -1 ... n - 2). See comments.
    :param opt: command line options
    :param int port: A TCP port to use. This will need to be changed to run
        multiple distributed training setups on the same machine.
    :param int gpu: Which GPU to use. Defaults to using rank and local devices,
        but must be manually specified when using many-hosts.
    :param str hostname: Hostname of the main server.
    """
    # Set per-host options
    opt = copy.deepcopy(opt)
    # we need to manually adjust the rank differently in multiprocessing
    # and distributed train
    rank = rank + rank_offset
    opt['rank'] = rank
    if gpu is None:
        # default assumption is local GPUs
        gpu = rank % torch.cuda.device_count()
    opt['gpu'] = gpu
    # make sure we don't just use whatever GPU was saved in the model file
    if 'override' not in opt:
        opt['override'] = {}
    opt['override']['gpu'] = gpu

    # Suppress output of workers except the main host.
    if opt.get('verbose') or rank != 0:
        print_prefix = '[rank:{:3d}]'.format(rank)
    else:
        print_prefix = None
    suppress_output = not opt.get('verbose') and rank != 0

    with distributed_utils.override_print(suppress_output, print_prefix):
        # perform distributed setup, ensuring all hosts are ready
        if opt['gpu'] != -1:
            torch.cuda.set_device(opt['gpu'])
        dist.init_process_group(
            backend="nccl",
            init_method="tcp://{}:{}".format(hostname, port),
            world_size=opt['distributed_world_size'],
            rank=rank,
        )
        logging.info("Distributed group initialized")

        # manual_seed can be a noop without this
        torch.cuda.init()
        # make sure all parameters will be in sync
        torch.manual_seed(42)
        # force a sync so that no one gets ahead, and all are seeded together
        distributed_utils.sync_object(None)

        # Run the actual training
        return single_train.TrainLoop(opt).train()
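The docstring above says multiprocess_train should be launched once per GPU. Below is a sketch of a local launcher using torch.multiprocessing.spawn, under the assumption (suggested by the rank and rank_offset comments) that the parent process participates as rank 0 while spawned workers take ranks 1..n-1; the launch_multiprocess name and control flow are illustrative, not part of the snippet above.

import torch.multiprocessing as mp

def launch_multiprocess(opt, port=61337):
    # spawn() calls multiprocess_train(i, opt, port, 1) for i in 0..nprocs-1,
    # so with rank_offset=1 the workers cover ranks 1..world_size-1
    spawn_context = mp.spawn(
        multiprocess_train,
        args=(opt, port, 1),
        nprocs=opt['distributed_world_size'] - 1,
        join=False,
    )
    try:
        # the parent process itself acts as rank 0
        retval = multiprocess_train(0, opt, port)
        spawn_context.join()
        return retval
    except KeyboardInterrupt:
        # best-effort cleanup of the spawned workers
        for p in spawn_context.processes:
            if p.is_alive():
                p.kill()
        raise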
Example #7
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        return single_train.TrainLoop(opt).train()
Example #8
def test_train_model_with_no_dict_file(self):
    """Ensure training a model requires a dict_file or model_file."""
    import parlai.scripts.train_model as tms

    with testing_utils.capture_output():
        parser = tms.setup_args()
        parser.set_params(task='babi:task1k:1', model='seq2seq')
        popt = parser.parse_args(print_args=False)
        with self.assertRaises(RuntimeError):
            tms.TrainLoop(popt)
Example #9
def test_train_model_with_no_dict_file(self):
    """Check that attempting to train a model without specifying a dict_file
    or model_file fails."""
    import parlai.scripts.train_model as tms

    with testing_utils.capture_output():
        parser = tms.setup_args()
        parser.set_params(task='babi:task1k:1', model='seq2seq')
        popt = parser.parse_args(print_args=False)
        with self.assertRaises(RuntimeError):
            tms.TrainLoop(popt)
Example #10
def multiprocess_train(rank, opt, port=61337, gpu=None, hostname='localhost'):
    """
    Subprocess which initializes distributed training, and begins training.

    This should be launched n times for n GPUs; this is handled either in main
    or via srun.

    :param int rank: This process's rank
    :param opt: command line options
    :param int port: A TCP port to use. This will need to be changed to run
        multiple distributed training setups on the same machine.
    :param int gpu: Which GPU to use. Defaults to using rank and local devices,
        but must be manually specified when using many-hosts.
    :param str hostname: Hostname of the main server.
    """
    # Set per-host options
    opt = copy.deepcopy(opt)
    opt['rank'] = rank
    if gpu is None:
        # default assumption is local GPUs
        gpu = rank % torch.cuda.device_count()
    opt['gpu'] = gpu
    # make sure we don't just use whatever GPU was saved in the model file
    if 'override' not in opt:
        opt['override'] = {}
    opt['override']['gpu'] = gpu

    # Suppress output of workers except the main host.
    if opt.get('verbose') or rank != 0:
        print_prefix = '[rank:{:2d}]'.format(rank)
    else:
        print_prefix = None
    distributed_utils.override_print(
        suppress=(not opt.get('verbose') and rank != 0),
        prefix=print_prefix
    )

    # perform distributed setup, ensuring all hosts are ready
    torch.cuda.set_device(opt['gpu'])
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://{}:{}".format(hostname, port),
        world_size=opt['distributed_world_size'],
        rank=rank,
    )
    print("Distributed group initialized")

    # Run the actual training
    return single_train.TrainLoop(opt).train()
Example #11
def get_tl(tmpdir):
    opt = {
        'task': 'integration_tests',
        'model': 'parlai.agents.test_agents.test_agents:MockTrainUpdatesAgent',
        'model_file': os.path.join(tmpdir, 'model'),
        'dict_file': os.path.join(tmpdir, 'model.dict'),
        # step opts
        'max_train_steps': num_train_steps,
        'validation_every_n_steps': int(num_train_steps / num_validations),
        'log_every_n_steps': int(num_train_steps / num_logs),
        'update_freq': update_freq,
    }
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([])
    for k, v in opt.items():
        popt[k] = v
    return tms.TrainLoop(popt)
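The snippet above closes over num_train_steps, num_validations, num_logs, and update_freq, which are defined by the surrounding test. A sketch with assumed values, just to show how the step-based options fit together; the concrete numbers and the tempdir helper are assumptions.

# Hypothetical values for the variables get_tl() closes over; the real test
# presumably parameterizes them differently.
num_train_steps = 100
num_validations = 10
num_logs = 5
update_freq = 2

with testing_utils.tempdir() as tmpdir:
    tl = get_tl(tmpdir)
    valid, test = tl.train()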
Example #12
def multiprocess_train(rank,
                       opt,
                       port=61337,
                       rank_offset=0,
                       gpu=None,
                       hostname='localhost'):
    init_method = f"tcp://{hostname}:{port}"
    with distributed_utils.distributed_context(rank,
                                               opt,
                                               rank_offset,
                                               gpu,
                                               init_method=init_method) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        try:
            return single_train.TrainLoop(opt).train()
        except Exception:
            import parlai.utils.logging as logging

            logging.critical(traceback.format_exc())
            logging.critical(
                f"Got the above exception on worker {rank + rank_offset}. "
                "This may cause hangs requiring manual killing of processes.")
            raise
Example #13
def run(self):
    with distributed_utils.slurm_distributed_context(self.opt) as opt:
        self.train_loop = single_train.TrainLoop(opt)
        return self.train_loop.train()
Example #14
def train_model(opt):
    import parlai.scripts.train_model as tms

    with tempdir() as tmpdir:
        if 'model_file' not in opt:
            opt['model_file'] = os.path.join(tmpdir, 'model')
        if 'dict_file' not in opt:
            opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
        parser = tms.setup_args()
        # needed at the very least to set the overrides.
        parser.set_params(**opt)
        parser.set_params(log_every_n_secs=10)
        popt = parser.parse_args([], print_args=False)
        # in some rare cases, like for instance if the model class also
        # overrides its default params, the params override will not
        # be taken into account.
        for k, v in opt.items():
            popt[k] = v
        tl = tms.TrainLoop(popt)
        valid, test = tl.train()

    return (valid, test)