def train_model(opt):
    """
    Run through a TrainLoop.

    If model_file is not in opt, then this helper will create a temporary
    directory to store the model, dict, etc.

    :return: (stdout, valid_results, test_results)
    :rtype: (str, dict, dict)
    """
    import parlai.scripts.train_model as tms

    with capture_output() as output:
        with tempdir() as tmpdir:
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(tmpdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
            parser = tms.setup_args()
            # needed at the very least to set the overrides.
            parser.set_params(**opt)
            parser.set_params(log_every_n_secs=10)
            popt = parser.parse_args(print_args=False)
            # in some rare cases, for instance if the model class also
            # overrides its default params, the params override will not
            # be taken into account.
            for k, v in opt.items():
                popt[k] = v
            tl = tms.TrainLoop(popt)
            valid, test = tl.train()
            return (output.getvalue(), valid, test)
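# Hedged usage sketch for the train_model helper above. The task/model pair
# ('integration_tests', 'repeat_label') mirrors the options used in get_tl
# below and is purely illustrative; the 'exs' key check assumes the standard
# ParlAI report format.
def example_train_model_usage():
    opt = {
        'task': 'integration_tests',
        'model': 'repeat_label',
        'num_epochs': 1.0,
    }
    stdout, valid, test = train_model(opt)
    # valid and test are the metric dicts from the final evaluations
    assert 'exs' in valid
    return valid, test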
def get_tl(tmpdir):
    final_opt = Opt(
        {
            'task': 'integration_tests',
            'datatype': 'valid',
            'validation_max_exs': 30,
            'short_final_eval': True,
        }
    )
    final_opt.save(os.path.join(tmpdir, "final_opt.opt"))

    opt = Opt(
        {
            'task': 'integration_tests',
            'validation_max_exs': 10,
            'model': 'repeat_label',
            'model_file': os.path.join(tmpdir, 'model'),
            'short_final_eval': True,
            'num_epochs': 1.0,
            'final_extra_opt': str(os.path.join(tmpdir, "final_opt.opt")),
        }
    )
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([])
    for k, v in opt.items():
        popt[k] = v
    return tms.TrainLoop(popt)
def get_popt_and_tl(opt):
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([], print_args=False)
    for k, v in opt.items():
        popt[k] = v
    return popt, tms.TrainLoop(popt)
def train_model(opt):
    """
    Run through a TrainLoop.

    If model_file is not in opt, then this helper will create a temporary
    directory to store the model, dict, etc.

    :return: (stdout, valid_results, test_results)
    :rtype: (str, dict, dict)
    """
    import parlai.scripts.train_model as tms

    with capture_output() as output:
        with tempdir() as tmpdir:
            if 'model_file' not in opt:
                opt['model_file'] = os.path.join(tmpdir, 'model')
            if 'dict_file' not in opt:
                opt['dict_file'] = os.path.join(tmpdir, 'model.dict')
            parser = tms.setup_args()
            parser.set_params(**opt)
            popt = parser.parse_args(print_args=False)
            tl = tms.TrainLoop(popt)
            valid, test = tl.train()
            return (output.getvalue(), valid, test)
def run(self):
    with distributed_utils.slurm_distributed_context(self.opt) as opt:
        self.train_loop = single_train.TrainLoop(opt)
        self.parser.opt = self.train_loop.agent.opt
        self.parser.print_args()
        return self.train_loop.train()
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    """
    Subprocess which initializes distributed training, and begins training.

    This should be launched n times for n GPUs; this is handled either in main
    or via srun.

    :param int rank: This process's rank - 1. (Starts at -1 ... n - 2). See comments.
    :param opt: command line options
    :param int port:
        A TCP port to use. This will need to be changed to run multiple
        distributed training setups on the same machine.
    :param int gpu:
        Which GPU to use. Defaults to using rank and local devices, but must be
        manually specified when using many-hosts.
    :param str hostname: Hostname of the main server.
    """
    # Set per-host options
    opt = copy.deepcopy(opt)
    # we need to manually adjust the rank differently in multiprocessing
    # and distributed train
    rank = rank + rank_offset
    opt['rank'] = rank
    if gpu is None:
        # default assumption is local GPUs
        gpu = rank % torch.cuda.device_count()
    opt['gpu'] = gpu
    # make sure we don't just use whatever GPU was saved in the model file
    if 'override' not in opt:
        opt['override'] = {}
    opt['override']['gpu'] = gpu

    # Suppress output of workers except the main host.
    if opt.get('verbose') or rank != 0:
        print_prefix = '[rank:{:3d}]'.format(rank)
    else:
        print_prefix = None
    suppress_output = not opt.get('verbose') and rank != 0

    with distributed_utils.override_print(suppress_output, print_prefix):
        # perform distributed setup, ensuring all hosts are ready
        if opt['gpu'] != -1:
            torch.cuda.set_device(opt['gpu'])
        dist.init_process_group(
            backend="nccl",
            init_method="tcp://{}:{}".format(hostname, port),
            world_size=opt['distributed_world_size'],
            rank=rank,
        )
        logging.info("Distributed group initialized")

        # manual_seed can be a noop without this
        torch.cuda.init()
        # make sure all parameters will be in sync
        torch.manual_seed(42)
        # force a sync so that no one gets ahead, and all are seeded together
        distributed_utils.sync_object(None)

        # Run the actual training
        return single_train.TrainLoop(opt).train()
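# Hedged launch sketch for the multiprocess_train variant above: start one
# worker per local GPU with torch.multiprocessing.spawn, which calls
# multiprocess_train(rank, *args) for rank in 0..nprocs-1. It assumes
# opt['distributed_world_size'] equals the number of spawned workers and that
# the default port is free; the upstream launcher may instead run rank 0 in
# the parent process and spawn the rest with a nonzero rank_offset.
import torch
import torch.multiprocessing as mp


def launch_multiprocess_train(opt, port=61337):
    nprocs = torch.cuda.device_count()
    mp.spawn(
        multiprocess_train,
        args=(opt, port),
        nprocs=nprocs,
        join=True,
    )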
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    with distributed_utils.distributed_context(
        rank, opt, port, rank_offset, gpu, hostname
    ) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        return single_train.TrainLoop(opt).train()
def test_train_model_with_no_dict_file(self):
    """Ensure training a model requires a dict_file or model_file."""
    import parlai.scripts.train_model as tms

    with testing_utils.capture_output():
        parser = tms.setup_args()
        parser.set_params(task='babi:task1k:1', model='seq2seq')
        popt = parser.parse_args(print_args=False)
        with self.assertRaises(RuntimeError):
            tms.TrainLoop(popt)
def test_train_model_with_no_dict_file(self):
    """
    Check that attempting to train a model without specifying a dict_file or
    model_file fails.
    """
    import parlai.scripts.train_model as tms

    with testing_utils.capture_output():
        parser = tms.setup_args()
        parser.set_params(task='babi:task1k:1', model='seq2seq')
        popt = parser.parse_args(print_args=False)
        with self.assertRaises(RuntimeError):
            tms.TrainLoop(popt)
def multiprocess_train(rank, opt, port=61337, gpu=None, hostname='localhost'):
    """
    Subprocess which initializes distributed training, and begins training.

    This should be launched n times for n GPUs; this is handled either in main
    or via srun.

    :param int rank: This process's rank
    :param opt: command line options
    :param int port:
        A TCP port to use. This will need to be changed to run multiple
        distributed training setups on the same machine.
    :param int gpu:
        Which GPU to use. Defaults to using rank and local devices, but must be
        manually specified when using many-hosts.
    :param str hostname: Hostname of the main server.
    """
    # Set per-host options
    opt = copy.deepcopy(opt)
    opt['rank'] = rank
    if gpu is None:
        # default assumption is local GPUs
        gpu = rank % torch.cuda.device_count()
    opt['gpu'] = gpu
    # make sure we don't just use whatever GPU was saved in the model file
    if 'override' not in opt:
        opt['override'] = {}
    opt['override']['gpu'] = gpu

    # Suppress output of workers except the main host.
    if opt.get('verbose') or rank != 0:
        print_prefix = '[rank:{:2d}]'.format(rank)
    else:
        print_prefix = None
    distributed_utils.override_print(
        suppress=(not opt.get('verbose') and rank != 0), prefix=print_prefix
    )

    # perform distributed setup, ensuring all hosts are ready
    torch.cuda.set_device(opt['gpu'])
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://{}:{}".format(hostname, port),
        world_size=opt['distributed_world_size'],
        rank=rank,
    )
    print("Distributed group initialized")

    # Run the actual training
    return single_train.TrainLoop(opt).train()
def get_tl(tmpdir):
    opt = {
        'task': 'integration_tests',
        'model': 'parlai.agents.test_agents.test_agents:MockTrainUpdatesAgent',
        'model_file': os.path.join(tmpdir, 'model'),
        'dict_file': os.path.join(tmpdir, 'model.dict'),
        # step opts
        'max_train_steps': num_train_steps,
        'validation_every_n_steps': int(num_train_steps / num_validations),
        'log_every_n_steps': int(num_train_steps / num_logs),
        'update_freq': update_freq,
    }
    parser = tms.setup_args()
    parser.set_params(**opt)
    popt = parser.parse_args([])
    for k, v in opt.items():
        popt[k] = v
    return tms.TrainLoop(popt)
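# Hedged usage sketch for the step-based get_tl above. num_train_steps,
# num_validations, num_logs and update_freq are assumed to be test-level
# constants; the values below are illustrative. tempdir() is assumed to be the
# same context manager used by the train_model helpers above, and
# TrainLoop.train() is expected to return the final (valid, test) reports.
num_train_steps = 100
num_validations = 10
num_logs = 20
update_freq = 2

with tempdir() as tmpdir:
    tl = get_tl(tmpdir)
    valid, test = tl.train()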
def multiprocess_train(
    rank, opt, port=61337, rank_offset=0, gpu=None, hostname='localhost'
):
    init_method = f"tcp://{hostname}:{port}"
    with distributed_utils.distributed_context(
        rank, opt, rank_offset, gpu, init_method=init_method
    ) as opt:
        # Run the actual training
        opt['multiprocessing'] = True
        try:
            return single_train.TrainLoop(opt).train()
        except Exception:
            import parlai.utils.logging as logging

            logging.critical(traceback.format_exc())
            logging.critical(
                f"Got the above exception on worker {rank + rank_offset}. "
                "This may cause hangs requiring manual killing of processes."
            )
            raise
def run(self):
    with distributed_utils.slurm_distributed_context(self.opt) as opt:
        self.train_loop = single_train.TrainLoop(opt)
        return self.train_loop.train()