def test_reset_tqdm_logger_handlers(self):
    serialization_dir_a = os.path.join(self.TEST_DIR, "test_a")
    os.makedirs(serialization_dir_a, exist_ok=True)
    prepare_global_logging(serialization_dir_a)
    serialization_dir_b = os.path.join(self.TEST_DIR, "test_b")
    os.makedirs(serialization_dir_b, exist_ok=True)
    prepare_global_logging(serialization_dir_b)
    # Use range(1) so that exactly two lines (0% and 100%) are written to the log file.
    for _ in Tqdm.tqdm(range(1)):
        pass

    with open(os.path.join(serialization_dir_a, "out.log"), "r") as f:
        assert len(f.readlines()) == 0

    with open(os.path.join(serialization_dir_b, "out.log"), "r") as f:
        assert len(f.readlines()) == 2
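# The behaviour asserted above is that each call to `prepare_global_logging` replaces the
# previously installed handlers, so tqdm/logging output only reaches the most recently
# configured serialization directory. Below is a standalone sketch of the same usage outside
# the test harness; the directory names are placeholders and the import paths assume the
# allennlp package layout used in this repository.
import os
import tempfile

from allennlp.common.logging import prepare_global_logging
from allennlp.common.tqdm import Tqdm

run_a = tempfile.mkdtemp(prefix="run_a_")   # first run directory, receives out.log
prepare_global_logging(run_a)

run_b = tempfile.mkdtemp(prefix="run_b_")   # second call re-points the handlers here
prepare_global_logging(run_b)

for _ in Tqdm.tqdm(range(1)):
    pass

print(os.path.getsize(os.path.join(run_a, "out.log")))   # expected: 0 (handlers were reset)
print(os.path.getsize(os.path.join(run_b, "out.log")))   # expected: > 0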
def _train_worker(
    process_rank: int,
    params: Params,
    serialization_dir: Union[str, PathLike],
    include_package: List[str] = None,
    dry_run: bool = False,
    node_rank: int = 0,
    master_addr: str = "127.0.0.1",
    master_port: int = 29500,
    world_size: int = 1,
    distributed_device_ids: List[int] = None,
    file_friendly_logging: bool = False,
    include_in_archive: List[str] = None,
) -> Optional[Model]:
    """
    Helper to train the configured model/experiment. In distributed mode, this is spawned
    as a worker process. In a single GPU experiment, this returns the `Model` object; in
    distributed training, nothing is returned.

    # Parameters

    process_rank : `int`
        The process index that is initialized using the GPU device id.
    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    include_package : `List[str]`, optional
        In distributed mode, since this function would have been spawned as a separate process,
        the extra imports need to be done again. NOTE: This does not have any effect in single
        GPU training.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    node_rank : `int`, optional
        Rank of the node.
    master_addr : `str`, optional (default=`"127.0.0.1"`)
        Address of the master node for distributed training.
    master_port : `int`, optional (default=`29500`)
        Port of the master node for distributed training.
    world_size : `int`, optional
        The number of processes involved in distributed training.
    distributed_device_ids : `List[int]`, optional
        IDs of the devices involved in distributed training.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    include_in_archive : `List[str]`, optional
        Paths within the `serialization_dir` that should be archived along with the default
        contents of the model archive.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in distributed training or in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    common_logging.prepare_global_logging(
        serialization_dir,
        rank=process_rank,
        world_size=world_size,
    )
    common_util.prepare_environment(params)

    distributed = world_size > 1

    # not using `allennlp.common.util.is_master` as the process group is yet to be initialized
    master = process_rank == 0

    include_package = include_package or []

    if distributed:
        # Since the worker is spawned and not forked, the extra imports need to be done again.
        # Both the ones from the plugins and the ones from `include_package`.
        import_plugins()
        for package_name in include_package:
            common_util.import_module_and_submodules(package_name)

        num_procs_per_node = len(distributed_device_ids)
        # The unique identifier of the worker process among all the processes in the
        # distributed training group is computed here. This is used while initializing
        # the process group via `init_process_group`.
        global_rank = node_rank * num_procs_per_node + process_rank

        # The number of processes per node is useful to know whether a process
        # is the master in its local node (the node on which it is running).
        os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node)

        # In distributed training, the configured device is always going to be a list.
        # The corresponding GPU id for this particular worker is obtained by picking the id
        # from the device list with the rank as index.
        gpu_id = distributed_device_ids[process_rank]  # type: ignore

        # Until now, "cuda_device" might not be set in the trainer params.
        # But a worker trainer only needs to know about its specific GPU id.
params["trainer"]["cuda_device"] = gpu_id params["trainer"]["world_size"] = world_size params["trainer"]["distributed"] = True if gpu_id >= 0: torch.cuda.set_device(int(gpu_id)) dist.init_process_group( backend="nccl", init_method=f"tcp://{master_addr}:{master_port}", world_size=world_size, rank=global_rank, ) else: dist.init_process_group( backend="gloo", init_method=f"tcp://{master_addr}:{master_port}", world_size=world_size, rank=global_rank, ) logging.info(f"Process group of world size {world_size} initialized " f"for distributed training in worker {global_rank}") train_loop = TrainModel.from_params( params=params, serialization_dir=serialization_dir, local_rank=process_rank, ) if dry_run: return None try: if distributed: # let the setup get ready for all the workers dist.barrier() metrics = train_loop.run() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. if master and os.path.exists( os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights.") archive_model(serialization_dir) raise if master: train_loop.finish(metrics) if not distributed: return train_loop.model return None
def train_model(
    params: Params,
    serialization_dir: Union[str, PathLike],
    recover: bool = False,
    force: bool = False,
    node_rank: int = 0,
    include_package: List[str] = None,
    dry_run: bool = False,
    file_friendly_logging: bool = False,
) -> Optional[Model]:
    """
    Trains the model specified in the given [`Params`](../common/params.md#params) object, using the data
    and training parameters also specified in that object, and saves the results in `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results and logs.
    recover : `bool`, optional (default=`False`)
        If `True`, we will try to recover a training run from an existing serialization
        directory. This is only intended for use when something actually crashed during
        the middle of a run. For continuing training a model on new data, see
        `Model.from_archive`.
    force : `bool`, optional (default=`False`)
        If `True`, we will overwrite the serialization directory if it already exists.
    node_rank : `int`, optional
        Rank of the current node in distributed training.
    include_package : `List[str]`, optional
        In distributed mode, extra packages mentioned will be imported in trainer workers.
    dry_run : `bool`, optional (default=`False`)
        Do not train a model, but create a vocabulary, show dataset statistics and other training
        information.
    file_friendly_logging : `bool`, optional (default=`False`)
        If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.

    # Returns

    best_model : `Optional[Model]`
        The model with the best epoch weights or `None` if in dry run.
    """
    common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging

    training_util.create_serialization_dir(params, serialization_dir, recover, force)
    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    include_in_archive = params.pop("include_in_archive", None)
    verify_include_in_archive(include_in_archive)

    distributed_params = params.params.pop("distributed", None)
    # If distributed isn't in the config and the config contains strictly
    # one cuda device, we just run a single training process.
    if distributed_params is None:
        model = _train_worker(
            process_rank=0,
            params=params,
            serialization_dir=serialization_dir,
            include_package=include_package,
            dry_run=dry_run,
            file_friendly_logging=file_friendly_logging,
            include_in_archive=include_in_archive,
        )

        if not dry_run:
            archive_model(serialization_dir, include_in_archive=include_in_archive)
        return model

    # Otherwise, we are running multiple processes for training.
    else:
        common_logging.prepare_global_logging(
            serialization_dir,
            rank=0,
            world_size=1,
        )

        # We are careful here so that we can raise a good error if someone
        # passed the wrong thing - cuda_devices are required.
        device_ids = distributed_params.pop("cuda_devices", None)
        multi_device = isinstance(device_ids, list) and len(device_ids) > 1
        num_nodes = distributed_params.pop("num_nodes", 1)

        if not (multi_device or num_nodes > 1):
            raise ConfigurationError(
                "Multiple cuda devices/nodes need to be configured to run distributed training."
            )
        check_for_gpu(device_ids)

        master_addr = distributed_params.pop("master_address", "127.0.0.1")
        if master_addr in ("127.0.0.1", "0.0.0.0", "localhost"):
            # If running locally, we can automatically find an open port if one is not specified.
            master_port = (
                distributed_params.pop("master_port", None) or common_util.find_open_port()
            )
        else:
            # Otherwise we require that the port be specified.
            master_port = distributed_params.pop("master_port")

        num_procs = len(device_ids)
        world_size = num_nodes * num_procs

        # Creating `Vocabulary` objects from workers could be problematic since
        # the data loaders in each worker will yield only `rank` specific
        # instances. Hence it is safe to construct the vocabulary and write it
        # to disk before initializing the distributed context. The workers will
        # load the vocabulary from the path specified.
        vocab_dir = os.path.join(serialization_dir, "vocabulary")
        if recover:
            vocab = Vocabulary.from_files(vocab_dir)
        else:
            vocab = training_util.make_vocab_from_params(
                params.duplicate(), serialization_dir, print_statistics=dry_run
            )
        params["vocabulary"] = {
            "type": "from_files",
            "directory": vocab_dir,
            "padding_token": vocab._padding_token,
            "oov_token": vocab._oov_token,
        }

        logging.info(
            "Switching to distributed training mode since multiple GPUs are configured | "
            f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | "
            f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | "
            f"World size: {world_size}"
        )

        mp.spawn(
            _train_worker,
            args=(
                params.duplicate(),
                serialization_dir,
                include_package,
                dry_run,
                node_rank,
                master_addr,
                master_port,
                world_size,
                device_ids,
                file_friendly_logging,
                include_in_archive,
            ),
            nprocs=num_procs,
        )
        if dry_run:
            return None
        else:
            archive_model(serialization_dir, include_in_archive=include_in_archive)
            model = Model.load(params, serialization_dir)
            return model
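# A minimal sketch of invoking `train_model` programmatically. The config path and output
# directory below are placeholders, and the import path assumes this module lives at
# `allennlp.commands.train`, as in the standard allennlp layout.
from allennlp.common import Params
from allennlp.commands.train import train_model

params = Params.from_file("experiment.jsonnet")    # placeholder config path
model = train_model(
    params,
    serialization_dir="output/my_run",             # placeholder output directory
    force=True,                                    # overwrite the directory if it already exists
    file_friendly_logging=True,
)
# For a non-distributed, non-dry run this returns the trained `Model` and writes
# model.tar.gz into the serialization directory.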
def main(args: argparse.Namespace):
    for package_name in args.include_package:
        import_module_and_submodules(package_name)

    params = Params.from_file(args.param_path, args.overrides)

    random_seed, numpy_seed, pytorch_seed = 41, 11, 302
    if not args.fix:
        random_seed, numpy_seed, pytorch_seed = (
            random.randint(0, 999999999),
            random.randint(0, 999999999),
            random.randint(0, 999999999),
        )
    params["random_seed"] = random_seed
    params["numpy_seed"] = numpy_seed
    params["pytorch_seed"] = pytorch_seed

    prepare_environment(params)
    serialization_dir = args.serialization_dir
    create_serialization_dir(params, serialization_dir, args.recover, args.force)
    prepare_global_logging(serialization_dir, args.file_friendly_logging)

    hyperparams = list(get_hyperparams(params.as_dict(infer_type_and_cast=True)))

    params.to_file(os.path.join(serialization_dir, CONFIG_NAME))

    test_file = params.params.get("test_data_path", None)
    validation_data_path = params.get("validation_data_path", None)
    evaluate_on_test = params.pop_bool("evaluate_on_test", False)
    test_command = None
    if evaluate_on_test:
        test_command = BaseEvaluationCommand.from_params(params.pop("test_command"))

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    train_model = TrainPipelineModel.from_params(
        params=params, serialization_dir=serialization_dir, local_rank=0
    )

    trainer = train_model.trainer

    if trainer.validation_command is not None:
        trainer.validation_command.maybe_set_gold_file(validation_data_path)

    params.assert_empty('base train command')

    if args.comet is not None:
        experiment = Experiment(
            api_key=args.comet,
            workspace=args.workspace,
            project_name=args.project,
            parse_args=False,
            auto_output_logging=None,
        )
        if args.tags:
            experiment.add_tags(args.tags)
        with open(args.param_path) as fil:
            code = "".join(fil.readlines())
        code += "\n\n#=============Full details=============\n\n"
        full_details = _jsonnet.evaluate_file(args.param_path)
        code += full_details
        code += "\n\n#=============IMPORTANT: overwritten options============\n\n"
        code += args.overrides
        experiment.set_code(code, overwrite=True)

        for key, val in hyperparams:
            experiment.log_parameter(key, val)
        experiment.log_parameter("model_directory", serialization_dir)
        experiment.log_parameter("cuda_device", cuda_device)
        experiment.log_parameter("hostname", socket.gethostname())
        experiment.log_parameter("random_seed", random_seed)
        experiment.log_parameter("numpy_seed", numpy_seed)
        experiment.log_parameter("pytorch_seed", pytorch_seed)
    else:
        experiment = None

    try:
        metrics = trainer.train(experiment)
    except KeyboardInterrupt:
        # if we have completed an epoch, try to create a model archive.
        if os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)):
            logging.info(
                "Training interrupted by the user. Attempting to create "
                "a model archive using the current best epoch weights."
            )
            archive_model(serialization_dir)
        raise

    # Evaluate
    if test_file and evaluate_on_test:
        logger.info(
            "The model will be evaluated using the best epoch weights (see pred_test.txt)."
        )
        trainer.annotator.annotate_file(
            trainer.model,
            test_file,
            os.path.join(serialization_dir, "pred_test.txt"),
        )

        if test_command:
            logger.info("Comparing against gold standard.")
            test_command.maybe_set_gold_file(test_file)
            test_metrics = test_command.evaluate(
                os.path.join(serialization_dir, "pred_test.txt")
            )
            if experiment:
                with experiment.test():
                    experiment.log_metrics(
                        {k: v for k, v in test_metrics.items() if np.isscalar(v)}
                    )
            metrics = merge_dicts(metrics, "test", test_metrics)

    dump_metrics(os.path.join(serialization_dir, "metrics.json"), metrics, log=True)

    if not args.no_archive:
        # Now tar up results
        archive_model(serialization_dir)
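# `main` only consumes an `argparse.Namespace`; the parser itself lives elsewhere in the
# repository. The sketch below is a hypothetical parser that supplies every attribute
# `main` reads -- the flag names, defaults, and help strings are assumptions, not
# necessarily what the real CLI uses.
import argparse


def build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Train a model from a jsonnet configuration.")
    parser.add_argument("param_path", type=str, help="path to the experiment configuration")
    parser.add_argument("serialization_dir", type=str, help="directory for models, logs and metrics")
    parser.add_argument("--include-package", dest="include_package", action="append", default=[])
    parser.add_argument("--overrides", type=str, default="", help="jsonnet snippet overriding config values")
    parser.add_argument("--fix", action="store_true", help="use the fixed seeds instead of random ones")
    parser.add_argument("--recover", action="store_true")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--file-friendly-logging", dest="file_friendly_logging", action="store_true")
    parser.add_argument("--no-archive", dest="no_archive", action="store_true")
    # Comet.ml related options; experiment logging is skipped entirely when --comet is omitted.
    parser.add_argument("--comet", type=str, default=None, help="Comet.ml API key")
    parser.add_argument("--workspace", type=str, default=None)
    parser.add_argument("--project", type=str, default=None)
    parser.add_argument("--tags", nargs="+", default=None)
    return parser


if __name__ == "__main__":
    main(build_arg_parser().parse_args())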