def run( seed=543, data_path="/tmp/cifar10", output_path="/tmp/output-cifar10/", model="resnet18_QAT_8b", batch_size=512, momentum=0.9, weight_decay=1e-4, num_workers=12, num_epochs=24, learning_rate=0.4, num_warmup_epochs=4, validate_every=3, checkpoint_every=1000, backend=None, resume_from=None, log_every_iters=15, nproc_per_node=None, with_clearml=False, with_amp=False, **spawn_kwargs, ): """Main entry to train an model on CIFAR10 dataset. Args: seed (int): random state seed to set. Default, 543. data_path (str): input dataset path. Default, "/tmp/cifar10". output_path (str): output path. Default, "/tmp/output-cifar10". model (str): model name (from torchvision) to setup model to train. Default, "resnet18". batch_size (int): total batch size. Default, 512. momentum (float): optimizer's momentum. Default, 0.9. weight_decay (float): weight decay. Default, 1e-4. num_workers (int): number of workers in the data loader. Default, 12. num_epochs (int): number of epochs to train the model. Default, 24. learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.4. num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 4. validate_every (int): run model's validation every ``validate_every`` epochs. Default, 3. checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200. backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl", "xla-tpu", "gloo" etc. Default, None. nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful, when main python process is spawning training as child processes. resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None. log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations. It can be 0 to disable it. Default, 15. with_clearml (bool): if True, experiment ClearML logger is setup. Default, False. with_amp (bool): if True, enables native automatic mixed precision. Default, False. **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes """ # catch all local parameters config = locals() config.update(config["spawn_kwargs"]) del config["spawn_kwargs"] spawn_kwargs["nproc_per_node"] = nproc_per_node with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: parallel.run(training, config)
def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
    """Main entry to run model's evaluation:
        - compute validation metrics

    Args:
        config_filepath (str): evaluation configuration .py file
        backend (str): distributed backend: nccl, gloo, horovod or None to run without distributed config
        with_clearml (bool): if True, uses ClearML as experiment tracking system
    """
    assert torch.cuda.is_available(), "CUDA device is required"
    assert torch.backends.cudnn.enabled
    torch.backends.cudnn.benchmark = True

    config_filepath = Path(config_filepath)
    assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"

    with idist.Parallel(backend=backend) as parallel:
        logger = setup_logger(name="Pascal-VOC12 Evaluation", distributed_rank=idist.get_rank())

        config = ConfigObject(config_filepath)
        InferenceConfigSchema.validate(config)
        config.script_filepath = Path(__file__)

        output_path = setup_experiment_tracking(config, with_clearml=with_clearml, task_type="testing")
        config.output_path = output_path

        utils.log_basic_info(logger, get_params(config, InferenceConfigSchema))

        try:
            parallel.run(evaluation, config, logger=logger, with_clearml=with_clearml)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def test_idist_parallel_no_dist():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=None) as parallel:
        parallel.run(_test_func, ws=1, device=device, backend=None, true_init_method=None)
def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, local_rank, world_size):
    if init_method == "FILE":
        init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared"

    os.environ["RANK"] = str(local_rank)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=backend, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=world_size, device=device, backend=backend, true_init_method=init_method)
def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
    if init_method == "FILE":
        init_method = f"file://{dirname}/shared"

    nproc_per_node = torch.cuda.device_count() if torch.cuda.is_available() else 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method)
def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname):
    if init_method == "FILE":
        init_method = f"file://{dirname}/shared"

    nproc_per_node = 4 if "gloo" == backend else torch.cuda.device_count()
    device = "cpu" if "gloo" == backend else "cuda"
    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel:
        parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method)
def test_parallel_wrong_inputs():
    with pytest.raises(ValueError, match=r"Unknown backend 'abc'. Available backends:"):
        idist.Parallel(backend="abc")

    with pytest.raises(ValueError, match=r"If backend is None, argument 'nnodes' should be also None"):
        idist.Parallel(nnodes=2)

    with pytest.raises(ValueError, match=r"Argument nproc_per_node should positive"):
        idist.Parallel(backend="gloo", nproc_per_node=-1)

    with pytest.raises(ValueError, match=r"Argument nnodes should positive"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=-1)

    with pytest.raises(ValueError, match=r"If number of nodes larger than one"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2)

    with pytest.raises(ValueError, match=r"Argument node_rank should be between 0 and"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2, node_rank=2)

    with pytest.raises(ValueError, match=r"If number of nodes larger than one, arguments master_addr and master_port"):
        idist.Parallel(backend="gloo", nproc_per_node=1, nnodes=2, node_rank=1)
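# The checks above imply what a valid multi-node configuration looks like: once
# nnodes > 1, node_rank, master_addr and master_port all become mandatory.
# A minimal sketch that passes every validation exercised in the test; the
# address and port values are placeholders, not taken from the test suite.
import ignite.distributed as idist

parallel = idist.Parallel(
    backend="gloo",
    nproc_per_node=1,
    nnodes=2,
    node_rank=0,              # must lie between 0 and nnodes - 1
    master_addr="127.0.0.1",  # placeholder rendezvous address
    master_port=2222,         # placeholder rendezvous port
)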
def test_idist_parallel_spawn_params_xla():
    res = idist.Parallel._setup_spawn_params(
        nproc_per_node=8, nnodes=None, node_rank=None, master_addr=None, master_port=None, start_method="fork"
    )
    assert "nproc_per_node" in res and res["nproc_per_node"] == 8
    assert "start_method" in res and res["start_method"] == "fork"

    with idist.Parallel(backend="xla-tpu", nproc_per_node=8, start_method="fork") as parallel:
        assert parallel.backend == "xla-tpu"
        res = parallel._spawn_params
        assert "nproc_per_node" in res and res["nproc_per_node"] == 8
        assert "start_method" in res and res["start_method"] == "fork"
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with
    `py_config_runner`, it must contain a `run(config, **kwargs)` method.
    """
    assert torch.cuda.is_available(), "CUDA device is required"
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_clearml:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def main():
    parser = ArgumentParser(parents=[get_default_parser()])
    config = parser.parse_args()

    with idist.Parallel(
        backend=config.backend,
{% if use_distributed_training and not use_distributed_launcher %}
        nproc_per_node=config.nproc_per_node,
{% if nnodes > 1 and not use_distributed_launcher %}
        node_rank=config.node_rank,
        nnodes=config.nnodes,
        master_addr=config.master_addr,
        master_port=config.master_port,
{% endif %}
{% endif %}
    ) as parallel:
        parallel.run(run, config=config)
def run(config, **kwargs):
    """This is the main method to run the training. As this training script is launched with
    `py_config_runner`, it must contain a `run(config, **kwargs)` method.
    """
    assert torch.cuda.is_available(), "CUDA device is required"
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Satellite segmentation Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        log_basic_info(logger, config)

        config.output_path = Path(tracking.get_output_path())
        # dump python files to reproduce the run
        tracking.log_artifact(config.config_filepath.as_posix())
        tracking.log_artifact(config.script_filepath.as_posix())
        tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Caught KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def main(cfg: DictConfig) -> None:
    with idist.Parallel(
        backend=cfg.distributed.backend, nproc_per_node=cfg.distributed.nproc_per_node
    ) as parallel:
        parallel.run(training, cfg)
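# The Hydra entry point above only requires the config to expose a `distributed`
# group with `backend` and `nproc_per_node`. A minimal sketch of an equivalent
# config built directly with OmegaConf; the values are placeholders, and calling
# the function directly (bypassing the Hydra launcher) is an assumption.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "distributed": {
            "backend": "nccl",    # or None / "gloo" / "xla-tpu"
            "nproc_per_node": 2,  # processes spawned per node
        }
    }
)
main(cfg)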
def main(hyperparams):
    with idist.Parallel(**hyperparams.dist_params) as parallel:
        parallel.run(run, hyperparams)
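# Here the whole distributed configuration travels as a dict on the hyperparameters
# object and is unpacked straight into idist.Parallel. A minimal sketch of such an
# object; SimpleNamespace and the concrete values are illustrative assumptions.
from types import SimpleNamespace

hyperparams = SimpleNamespace(
    dist_params={"backend": "gloo", "nproc_per_node": 2},  # any idist.Parallel kwargs
    lr=0.01,
)
main(hyperparams)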
def main(
    experiment_name: str,
    gpus: Union[int, Tuple[int, ...], str] = "auto",
    nproc_per_node: Union[int, str] = "auto",
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: str = None,
) -> None:
    """
    Args:
        experiment_name: the name of each run
        dataset_root: dataset root directory for VOC2012 Dataset
        gpus: can be "auto", "none" or gpu device ids like "0,1"
        log_dir: where to put all the logs
        epochs: number of epochs to train
        model: model to use, possible options are "fasterrcnn_resnet50_fpn",
            "fasterrcnn_mobilenet_v3_large_fpn", "fasterrcnn_mobilenet_v3_large_320_fpn"
        batch_size: batch size
        lr: initial learning rate
        download: whether to automatically download dataset
        image_size: image size for training and validation
        resume_from: path of checkpoint to resume from
    """
    if model not in AVAILABLE_MODELS:
        raise RuntimeError(f"Invalid model name: {model}")

    if isinstance(gpus, int):
        gpus = (gpus,)
    if isinstance(gpus, tuple):
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(gpu) for gpu in gpus])
    elif gpus == "auto":
        gpus = tuple(range(torch.cuda.device_count()))
    elif gpus == "none":
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        gpus = tuple()

    ngpu = len(gpus)
    backend = "nccl" if ngpu > 0 else "gloo"
    if nproc_per_node == "auto":
        nproc_per_node = ngpu if ngpu > 0 else max(multiprocessing.cpu_count() // 2, 1)

    # to prevent multiple downloads of the pretrained checkpoint, create the model in the main process
    model = getattr(detection, model)(pretrained=True)
    if model.__class__.__name__ == "FasterRCNN":
        in_features = model.roi_heads.box_predictor.cls_score.in_features
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, 21)
    elif model.__class__.__name__ == "RetinaNet":
        head = RetinaNetClassificationHead(
            model.backbone.out_channels,
            model.anchor_generator.num_anchors_per_location()[0],
            num_classes=21,
        )
        model.head.classification_head = head

    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(
            run,
            "cuda" if ngpu > 0 else "cpu",
            experiment_name,
            gpus,
            dataset_root,
            log_dir,
            model,
            epochs,
            batch_size,
            lr,
            download,
            image_size,
            resume_from,
        )
def run(
    seed=543,
    data_dir="/tmp/data",
    output_dir="/tmp/output-imdb/",
    model="bert-base-uncased",
    model_dir="/tmp/model",
    tokenizer_dir="/tmp/tokenizer",
    num_classes=1,
    dropout=0.3,
    n_fc=768,
    max_length=256,
    batch_size=128,
    weight_decay=0.01,
    num_workers=4,
    num_epochs=3,
    learning_rate=5e-5,
    num_warmup_epochs=0,
    validate_every=1,
    checkpoint_every=1000,
    backend=None,
    resume_from=None,
    log_every_iters=15,
    nproc_per_node=None,
    with_clearml=False,
    with_amp=False,
    **spawn_kwargs,
):
    """Main entry to finetune a transformer model on the IMDB dataset for sentiment classification.

    Args:
        seed (int): random state seed to set. Default, 543.
        data_dir (str): dataset cache directory. Default, "/tmp/data".
        output_dir (str): output directory. Default, "/tmp/output-imdb/".
        model (str): model name (from transformers) to setup model, tokenizer and config to train.
            Default, "bert-base-uncased".
        model_dir (str): cache directory to download the pretrained model. Default, "/tmp/model".
        tokenizer_dir (str): tokenizer cache directory. Default, "/tmp/tokenizer".
        num_classes (int): number of target classes. Default, 1 (binary classification).
        dropout (float): dropout probability. Default, 0.3.
        n_fc (int): number of neurons in the last fully connected layer. Default, 768.
        max_length (int): maximum number of tokens for the inputs to the transformer model. Default, 256.
        batch_size (int): total batch size. Default, 128.
        weight_decay (float): weight decay. Default, 0.01.
        num_workers (int): number of workers in the data loader. Default, 4.
        num_epochs (int): number of epochs to train the model. Default, 3.
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 5e-5.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay. Default, 0.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 1.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 1000.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl",
            "xla-tpu", "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful,
            when main python process is spawning training as child processes.
        resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
        log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
            It can be 0 to disable it. Default, 15.
        with_clearml (bool): if True, experiment ClearML logger is setup. Default, False.
        with_amp (bool): if True, enables native automatic mixed precision. Default, False.
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]
    spawn_kwargs["nproc_per_node"] = nproc_per_node

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)
def run(
    seed=42,
    data_path="./data",
    subset_train="train",
    subset_val="val",
    output_path="./output",
    architecture="FPN",
    encoder="resnet50",
    encoder_weights="imagenet",
    encoder_freeze_at=None,
    batch_size=6,
    optimizer="Adam",
    weight_decay=1e-4,
    num_workers=12,
    num_iterations=10000,
    learning_rate=0.0001,
    learning_rate_milestone_iterations=(2000, 8000),
    gamma=0.1,
    num_warmup_iterations=1000,
    warmup_factor=0.001,
    validate_every=10,
    checkpoint_every=200,
    backend=None,
    resume_from=None,
    log_every_iters=0,
    nproc_per_node=None,
    stop_iteration=None,
    with_trains=False,
    active_gpu_ids=(0,),
    **spawn_kwargs,
):
    """Main entry to train a model for semantic segmentation of carbon black agglomerate TEM images.

    Args:
        seed (int): random state seed to set. Default, 42.
        data_path (str): input dataset path. Default, "./data".
        subset_train (str): name of training subset. Default, "train".
        subset_val (str): name of validation subset. Default, "val".
        output_path (str): output path. Default, "./output".
        architecture (str): architecture (see https://github.com/qubvel/segmentation_models.pytorch#architectures-).
            Default, "FPN".
        encoder (str): encoder architecture (see https://github.com/qubvel/segmentation_models.pytorch#encoders-).
            Default, "resnet50".
        encoder_weights (str): pretrained weights (see https://github.com/qubvel/segmentation_models.pytorch#encoders-).
            Default, "imagenet".
        encoder_freeze_at (int or None): defines stages of the encoder which are frozen before the training
            (e.g. 2 means all stages including stage 2 and beyond). Default, None.
        batch_size (int): total batch size. Default, 6.
        optimizer (str): optimizer. Default, "Adam".
        weight_decay (float): weight decay. Default, 1e-4.
        num_workers (int): number of workers in the data loader. Default, 12.
        num_iterations (int): number of iterations to train the model. Default, 10000.
        learning_rate (float): peak of piecewise linear learning rate scheduler. Default, 0.0001.
        learning_rate_milestone_iterations (iterable of int): numbers of iterations where learning rate is each time
            decreased by a factor gamma. Default, (2000, 8000).
        gamma (float): factor to multiply learning rate with at each milestone. Default, 0.1.
        num_warmup_iterations (int): number of warm-up iterations before learning rate decay. Default, 1000.
        warmup_factor (float): learning rate starts at warmup_factor * learning_rate. Default, 0.001.
        validate_every (int): run model's validation every ``validate_every`` epochs. Default, 10.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations. Default, 200.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl",
            "xla-tpu", "gloo" etc. Default, None.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful,
            when main python process is spawning training as child processes. Default, None.
        resume_from (str, optional): path to checkpoint to use to resume the training from. Default, None.
        log_every_iters (int): argument to log batch loss every ``log_every_iters`` iterations.
            It can be 0 to disable it. Default, 0.
        stop_iteration (int, optional): iteration to stop the training. Can be used to check resume from checkpoint.
            Default, None.
        with_trains (bool): if True, experiment Trains logger is setup. Default, False.
        active_gpu_ids (tuple of int): ids of GPUs to use. Default, (0,).
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]

    utils.select_active_gpus(config["active_gpu_ids"])

    spawn_kwargs["nproc_per_node"] = nproc_per_node

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)
def run(config: ConfigSchema) -> None:
    spawn_kwargs = config.spawn_kwargs
    spawn_kwargs["nproc_per_node"] = config.nproc_per_node

    with idist.Parallel(backend=config.backend, **spawn_kwargs) as parallel:
        parallel.run(run_training, config)
                trainer.state.output,
            )
        )

    trainer.run(train_loader, max_epochs=1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Pytorch Ignite - idist")
    parser.add_argument("--backend", type=str, default="nccl")
    parser.add_argument("--nproc_per_node", type=int)
    parser.add_argument("--log_interval", type=int, default=4)
    parser.add_argument("--nb_samples", type=int, default=128)
    parser.add_argument("--batch_size", type=int, default=16)

    args_parsed = parser.parse_args()

    # idist from ignite handles multiple backends (gloo, nccl, horovod, xla)
    # and launchers (torch.distributed.launch, horovodrun, slurm)
    config = {
        "log_interval": args_parsed.log_interval,
        "batch_size": args_parsed.batch_size,
        "nb_samples": args_parsed.nb_samples,
    }

    spawn_kwargs = dict()
    spawn_kwargs["nproc_per_node"] = args_parsed.nproc_per_node

    # Specific ignite.distributed
    with idist.Parallel(backend=args_parsed.backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)
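# Each snippet hands a callable such as `training` to parallel.run, which invokes it
# in every process as fn(local_rank, *args, **kwargs). A minimal sketch of a
# compatible callable; the body is illustrative, not the original training function.
import ignite.distributed as idist


def training(local_rank, config):
    rank = idist.get_rank()       # global rank across all nodes
    ws = idist.get_world_size()   # total number of processes
    device = idist.device()       # backend-appropriate device for this process
    print(f"Process {rank}/{ws} (local rank {local_rank}) runs on {device}")
    # ... build model, data loaders and the ignite trainer from `config` here ...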
def run(
    seed=543,
    data_path="/tmp/cifar10",
    output_path="/tmp/output-cifar10/",
    model="vit_tiny_patch4_32x32",
    rescale_size=None,
    rand_aug=None,
    rand_erasing=None,
    optimizer="adam",
    batch_size=128,
    weight_decay=1e-4,
    num_workers=4,
    num_epochs=200,
    learning_rate=0.001,
    num_warmup_epochs=0,
    validate_every=3,
    checkpoint_every=1000,
    backend=None,
    resume_from=None,
    nproc_per_node=None,
    with_pbar=False,
    with_amp=False,
    cutmix_beta=0.0,
    cutmix_prob=0.5,
    rescaled_size=None,
    with_clearml=False,
    smoke_test=False,
    **spawn_kwargs,
):
    """Main entry to train a model on the CIFAR10 dataset.

    Args:
        seed (int): random state seed to set.
        data_path (str): input dataset path. Default, "/tmp/cifar10".
        output_path (str): output path. Default, "/tmp/output-cifar10".
        model (str): model name (from torchvision) to setup model to train.
        batch_size (int): total batch size.
        optimizer (str): optimizer name. Possible values: "sgd", "adam", "adamw". Default, "adam".
        weight_decay (float): weight decay.
        num_workers (int): number of workers in the data loader.
        num_epochs (int): number of epochs to train the model.
        learning_rate (float): peak of piecewise linear learning rate scheduler.
        num_warmup_epochs (int): number of warm-up epochs before learning rate decay.
        validate_every (int): run model's validation every ``validate_every`` epochs.
        checkpoint_every (int): store training checkpoint every ``checkpoint_every`` iterations.
        backend (str, optional): backend to use for distributed configuration. Possible values: None, "nccl",
            "gloo" etc.
        nproc_per_node (int, optional): optional argument to setup number of processes per node. It is useful,
            when main python process is spawning training as child processes.
        resume_from (str, optional): path to checkpoint to use to resume the training from.
        with_pbar (bool): if True, adds a progress bar on training iterations.
        with_amp (bool): if True, uses torch native AMP.
        rescale_size (int, optional): if provided then input image will be rescaled to that value.
        cutmix_beta (float): beta value for the distribution of the cutmix.
        cutmix_prob (float): cutmix probability.
        with_clearml (bool): if True, experiment ClearML logger is setup.
        smoke_test (bool): run 5 iterations and quit.
        **spawn_kwargs: Other kwargs to spawn run in child processes: master_addr, master_port, node_rank, nnodes
    """
    # catch all local parameters
    config = locals()
    config.update(config["spawn_kwargs"])
    del config["spawn_kwargs"]
    spawn_kwargs["nproc_per_node"] = nproc_per_node

    if backend == "xla-tpu" and with_amp:
        raise RuntimeError("The value of with_amp should be False if backend is xla")

    with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
        parallel.run(training, config)
def test_idist_parallel_gloo():
    with idist.Parallel(backend="gloo", nproc_per_node=4) as parallel:
        parallel.run(_test_func, ws=4, device="cpu")
def test_idist_parallel_nccl():
    with idist.Parallel(backend="nccl", nproc_per_node=torch.cuda.device_count()) as parallel:
        parallel.run(_test_func, ws=torch.cuda.device_count(), device="cuda")
def test_idist_parallel_nccl_nprocs(local_rank, world_size):
    os.environ["RANK"] = str(local_rank)
    with idist.Parallel(backend="nccl") as parallel:
        parallel.run(_test_func, ws=world_size, device="cuda")
parser.add_argument("--backend", type=str, default=None) parser.add_argument("--nproc_per_node", type=int, default=None) parser.add_argument("--nnodes", type=int, default=None) parser.add_argument("--node_rank", type=int, default=None) parser.add_argument("--master_addr", type=str, default=None) parser.add_argument("--master_port", type=str, default=None) parser.add_argument("--init_method", type=str, default=None) args = parser.parse_args() config = { "model": "resnet18", "lr": 0.01, } if args.backend in ["gloo", "nccl"]: config[ "true_init_method"] = args.init_method if args.init_method is not None else "env://" dist_config = dict( nproc_per_node=args.nproc_per_node, nnodes=args.nnodes, node_rank=args.node_rank, master_addr=args.master_addr, master_port=args.master_port, ) if args.init_method is not None: dist_config["init_method"] = args.init_method with idist.Parallel(backend=args.backend, **dist_config) as parallel: parallel.run(training, config, a=1, b=2)
def test_idist_parallel_no_dist():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    with idist.Parallel(backend=None) as parallel:
        parallel.run(_test_func, ws=1, device=device)
# Simply run everything on your infrastructure

# --- Single computation device ---
# $ python main.py
#
if __name__ == "__main__" and not (in_colab or with_torch_launch):

    backend = None  # or "nccl", "gloo", "xla-tpu" ...
    nproc_per_node = None  # or N to spawn N processes

    config = {
        "model": "resnet18",
        "dataset": "cifar10",
    }

    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(training, config)

# --- Multiple GPUs ---
# $ python -m torch.distributed.launch --nproc_per_node=2 --use_env main.py
#
if __name__ == "__main__" and with_torch_launch:

    backend = "nccl"  # or "gloo", "xla-tpu" ...
    nproc_per_node = None  # or N to spawn N processes

    config = {
        "model": "resnet18",
        "dataset": "cifar10",
    }

    with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node) as parallel:
        parallel.run(training, config)
help="Override train batch size") parser.add_argument("--lr", type=float, default=None, help="Override train learning rate") parser.add_argument("--ep", type=int, default=None, help="Override number of epochs") args = parser.parse_args() assert args.config is not None assert args.config.exists() # Define configuration mutations if certain cmd args are defined mutations = {} if args.bs is not None: mutations["train_batch_size"] = args.bs if args.lr is not None: mutations["learning_rate"] = args.lr if args.ep is not None: mutations["num_epochs"] = args.ep # Pass configuration file into py_config_runner.ConfigObject # and fetch configuration parameters as attributes config = ConfigObject(args.config, mutations=mutations) with idist.Parallel(backend=args.backend, nproc_per_node=args.nproc_per_node) as parallel: parallel.run(training, config)