def test_get_params():
    """Attributes set on a config object must surface in get_params output."""
    cfg = _Config()
    cfg.seed = 1234
    cfg.debug = True

    extracted = get_params(cfg, BASE_CONFIG)

    for key in ('seed', 'debug'):
        assert key in extracted
def run(config, logger=None, local_rank=0, **kwargs):
    """Entry point for distributed inference launched via `py_config_runner`.

    Initializes the NCCL process group, manually loads and validates the
    configuration, logs run metadata to MLflow from rank 0 only, then runs
    `inference(...)`.

    Args:
        config: configuration wrapper; must expose ``setup()`` because the
            runner is invoked with ``--manual_config_load``.
        logger: unused in this variant (errors are reported via MLflow).
        local_rank: local process rank forwarded to ``inference``.
        **kwargs: extra runner arguments, ignored.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), "We need to manually setup the configuration, please set --manual_config_load " \
                                     "to py_config_runner"

    config = config.setup()

    assert_config(config, INFERENCE_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # Store inference outputs next to the MLflow artifacts for this run.
    output_path = mlflow.get_artifact_uri()
    config.output_path = Path(output_path)

    if dist.get_rank() == 0:
        # dump python files to reproduce the run
        mlflow.log_artifact(config.config_filepath.as_posix())
        mlflow.log_artifact(config.script_filepath.as_posix())

        mlflow.log_params({
            "pytorch version": torch.__version__,
            "ignite version": ignite.__version__,
        })
        mlflow.log_params(get_params(config, INFERENCE_CONFIG))
        mlflow.log_params({'mean': config.mean, 'std': config.std})

    try:
        import os

        # Progress bar on iterations can be disabled through the environment.
        with_pbar_on_iters = True
        if "DISABLE_PBAR_ON_ITERS" in os.environ:
            with_pbar_on_iters = False

        inference(config, local_rank=local_rank, with_pbar_on_iters=with_pbar_on_iters)
    except KeyboardInterrupt:
        # NOTE(review): an interrupt falls through to the "OK" status below —
        # confirm that is intended rather than a distinct "INTERRUPTED" status.
        pass
    except Exception as e:
        if dist.get_rank() == 0:
            mlflow.log_param("Run Status", "FAILED")
        dist.destroy_process_group()
        raise e

    if dist.get_rank() == 0:
        mlflow.log_param("Run Status", "OK")
    dist.destroy_process_group()
def run(config, logger=None, local_rank=0, **kwargs):
    """Entry point for distributed training launched via `py_config_runner`.

    Initializes the NCCL process group, manually loads and validates the
    configuration, records run metadata to MLflow, and drives `training(...)`.

    Args:
        config: configuration wrapper; must expose ``setup()`` because the
            runner is invoked with ``--manual_config_load``.
        logger: used to report interrupts/failures.
            NOTE(review): defaults to None but the except-branches call
            ``logger.info`` / ``logger.exception`` unconditionally — verify
            callers always pass a logger, otherwise a failure here raises
            AttributeError instead of the original error.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: extra runner arguments, ignored.
    """
    assert torch.cuda.is_available()
    assert (torch.backends.cudnn.enabled
            ), "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    # NOTE(review): unlike the parameter logging below, these artifact calls
    # are not guarded by a rank check — confirm every rank should log them.
    mlflow.log_artifact(config.config_filepath.as_posix())
    mlflow.log_artifact(config.script_filepath.as_posix())

    # Store training outputs next to the MLflow artifacts for this run.
    output_path = mlflow.get_artifact_uri()
    config.output_path = Path(output_path)

    if dist.get_rank() == 0:
        mlflow.log_params({
            "pytorch version": torch.__version__,
            "ignite version": ignite.__version__,
        })
        mlflow.log_params(get_params(config, TRAINVAL_CONFIG))

    try:
        training(
            config,
            local_rank=local_rank,
            with_mlflow_logging=True,
            with_plx_logging=False,
        )
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        mlflow.log_param("Run Status", "FAILED")
        dist.destroy_process_group()
        raise e

    mlflow.log_param("Run Status", "OK")
    dist.destroy_process_group()
def run(config, **kwargs):
    """This is the main method to run the training.

    As this training script is launched with `py_config_runner` it should
    obligatory contain `run(config, **kwargs)` method.

    Validates the configuration, optionally attaches a ClearML task on rank 0,
    records artifacts/params through ``exp_tracking``, and runs ``training``
    inside an ignite ``idist.Parallel`` NCCL context.
    """

    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        if idist.get_rank() == 0 and exp_tracking.has_clearml:
            try:
                from clearml import Task
            except ImportError:
                # Backwards-compatibility for legacy Trains SDK
                from trains import Task

            task = Task.init("Pascal-VOC12 Training", config.config_filepath.stem)
            task.connect_configuration(config.config_filepath.as_posix())

        log_basic_info(logger, config)

        config.output_path = Path(exp_tracking.get_output_path())
        # dump python files to reproduce the run
        exp_tracking.log_artifact(config.config_filepath.as_posix())
        exp_tracking.log_artifact(config.script_filepath.as_posix())
        exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e
def run(config, logger=None, local_rank=0, **kwargs):
    """Entry point for distributed training launched via `py_config_runner`,
    with experiment tracking on Polyaxon.

    Initializes the NCCL process group, manually loads and validates the
    configuration, logs parameters to the Polyaxon experiment from rank 0,
    and drives `training(...)`.

    Args:
        config: configuration wrapper; must expose ``setup()`` because the
            runner is invoked with ``--manual_config_load``.
        logger: used to report interrupts/failures.
            NOTE(review): defaults to None but the except-branches call
            ``logger.info`` / ``logger.exception`` unconditionally — verify
            callers always pass a logger.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: extra runner arguments, ignored.
    """
    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # Polyaxon provides the per-run outputs directory.
    config.output_path = Path(get_outputs_path())

    if dist.get_rank() == 0:
        plx_exp = Experiment()
        # Params are passed as keyword arguments; keys contain spaces, which
        # only works through dict-unpacking as done here.
        plx_exp.log_params(
            **{
                "pytorch version": torch.__version__,
                "ignite version": ignite.__version__,
            })
        plx_exp.log_params(**get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_mlflow_logging=False, with_plx_logging=True)
    except KeyboardInterrupt:
        logger.info("Catched KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
def run(config, logger=None, local_rank=0, **kwargs):
    """Entry point for distributed training launched via `py_config_runner`,
    with experiment tracking through a Trains/ClearML ``Task``.

    Initializes the task and NCCL process group, manually loads and validates
    the configuration, attaches config/script artifacts to the task, and
    drives `training(...)`.

    Args:
        config: configuration wrapper; must expose ``setup()`` because the
            runner is invoked with ``--manual_config_load``.
        logger: used to report interrupts/failures.
            NOTE(review): defaults to None but the except-branches call
            ``logger.info`` / ``logger.exception`` unconditionally — verify
            callers always pass a logger.
        local_rank: local process rank forwarded to ``training``.
        **kwargs: extra runner arguments, ignored.
    """
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    task = Task.init("ignite", "DeeplabV3_ResNet101 pascal_voc2012 segmentation example")

    dist.init_process_group("nccl", init_method="env://")

    # As we passed config with option --manual_config_load
    assert hasattr(config, "setup"), (
        "We need to manually setup the configuration, please set --manual_config_load "
        "to py_config_runner")

    config = config.setup()

    assert_config(config, TRAINVAL_CONFIG)
    # The following attributes are automatically added by py_config_runner
    assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
    assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

    # dump python files to reproduce the run
    task.connect_configuration(config.config_filepath.as_posix())
    task.upload_artifact("script", config.script_filepath)

    config.output_path = Path("./artifacts")

    # log the configuration, if we are the master node
    if dist.get_rank() == 0:
        task.connect(get_params(config, TRAINVAL_CONFIG))

    try:
        training(config, local_rank=local_rank, with_trains_logging=True)
    except KeyboardInterrupt:
        logger.info("Caught KeyboardInterrupt -> exit")
    except Exception as e:  # noqa
        logger.exception("")
        dist.destroy_process_group()
        raise e

    dist.destroy_process_group()
def test_get_train_params():
    """A fully-populated TRAIN_CONFIG should expose train-loader details in get_params."""
    from py_config_runner.config_utils import TRAIN_CONFIG

    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader

    cfg = _Config()
    cfg.seed = 1234
    cfg.debug = True
    cfg.device = "cpu"
    cfg.model = nn.Linear(1, 1)
    cfg.num_epochs = 1
    cfg.criterion = nn.CrossEntropyLoss()
    cfg.optimizer = optim.SGD(cfg.model.parameters(), lr=0.1)

    data = [1, 2, 3, 4, 5]
    cfg.train_loader = DataLoader(data, batch_size=1)

    extracted = get_params(cfg, TRAIN_CONFIG)

    for key in ('train loader', 'train loader batch size'):
        assert key in extracted
def run(config, **kwargs):
    """This is the main method to run the training.

    As this training script is launched with `py_config_runner` it should
    obligatory contain `run(config, **kwargs)` method.

    Validates the configuration, records artifacts/params through ``tracking``,
    and runs ``training`` inside an ignite ``idist.Parallel`` NCCL context.
    """

    assert torch.cuda.is_available(), torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled."

    with idist.Parallel(backend="nccl") as parallel:

        logger = setup_logger(name="Satellite segmentation Training", distributed_rank=idist.get_rank())

        assert_config(config, TRAINVAL_CONFIG)
        # The following attributes are automatically added by py_config_runner
        assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path)
        assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path)

        log_basic_info(logger, config)

        config.output_path = Path(tracking.get_output_path())
        # dump python files to reproduce the run
        tracking.log_artifact(config.config_filepath.as_posix())
        tracking.log_artifact(config.script_filepath.as_posix())
        tracking.log_params(get_params(config, TRAINVAL_CONFIG))

        try:
            parallel.run(training, config, logger=logger)
        except KeyboardInterrupt:
            logger.info("Catched KeyboardInterrupt -> exit")
        except Exception as e:  # noqa
            logger.exception("")
            raise e