def cli_main():
    """Train and test a LitAutoEncoder on MNIST, logging to AzureML via MLflow.

    Command-line entry point: parses Trainer/model args, builds the MNIST
    dataloaders, wires the Lightning MLFlowLogger to the ambient AzureML run,
    then fits and tests the model.
    """
    pl.seed_everything(1234)

    # ------------ CLI arguments (model + Trainer) ------------
    parser = ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--hidden_dim", type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------ MNIST data: 55k train / 5k val split + test set ------------
    full_train = MNIST("", train=True, download=True, transform=transforms.ToTensor())
    mnist_test = MNIST("", train=False, download=True, transform=transforms.ToTensor())
    mnist_train, mnist_val = random_split(full_train, [55000, 5000])

    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------ model ------------
    model = LitAutoEncoder()

    # ------------ logging: point MLflow at the AzureML workspace ------------
    aml_run = Run.get_context()
    tracking_uri = aml_run.experiment.workspace.get_mlflow_tracking_uri()
    experiment_name = aml_run.experiment.name
    mlf_logger = MLFlowLogger(experiment_name=experiment_name, tracking_uri=tracking_uri)
    # Tie the MLflow logger's run ID to the AzureML run so metrics land in
    # the same run (relies on the logger's private _run_id attribute).
    mlf_logger._run_id = aml_run.id

    # ------------ train ------------
    trainer = pl.Trainer.from_argparse_args(args, logger=mlf_logger)
    trainer.fit(model, train_loader, val_loader)

    # ------------ test ------------
    result = trainer.test(test_dataloaders=test_loader)
    print(result)
def create_logger() -> Union[bool, LightningLoggerBase]:
    """Build a logger for the current execution context.

    Returns an ``MLFlowLogger`` bound to the AzureML run when running as a
    submitted AML job, otherwise ``True`` (Lightning's default local logging).

    Loosely imitate:
    https://github.com/Azure/azureml-examples/blob/main/tutorials/using-pytorch-lightning/3.log-with-mlflow.ipynb
    """
    run = Run.get_context()
    if not isinstance(run, _SubmittedRun):
        # Not an AML-submitted run (e.g. local execution): fall back to
        # Lightning's default logger.
        log.warning("Unable to get AML run context! Logging locally.")
        return True

    tracking_uri = run.experiment.workspace.get_mlflow_tracking_uri()
    exp_name = run.experiment.name
    log.info(
        f"Using MLFlow logger with tracking URI {tracking_uri} and experiment name {exp_name}"
    )
    mlf_logger = MLFlowLogger(exp_name, tracking_uri)
    # Link the MLflow logger to the AzureML run ID so both log to one run
    # (uses the logger's private _run_id attribute).
    mlf_logger._run_id = run.id
    return mlf_logger
def main():
    """Main entry point of the program.

    Parses CLI/Trainer arguments, stages data (optionally through a temporary
    working folder), loads hyper-parameters from the YAML config, sets up
    MLflow + TensorBoard logging, and launches training via ``run``.

    Note:
        This main.py file is meant to be called using the cli,
        see the `examples/local/run.sh` file to see how to use it.

    Raises:
        SystemExit: if ``--config`` is missing (via ``parser.error``).
        ValueError: if the config already defines the reserved key ``output_dir``.
    """
    parser = argparse.ArgumentParser()
    # __TODO__ check you need all the following CLI parameters
    parser.add_argument(
        '--config',
        help='config file with generic hyper-parameters, such as optimizer, '
        'batch_size, ... - in yaml format')
    parser.add_argument('--data', help='path to data', required=True)
    parser.add_argument('--data-module', default="hdf5",
                        help="Data module to use. file or hdf5")
    parser.add_argument(
        '--tmp-folder',
        help='will use this folder as working folder - it will copy the input data '
        'here, generate results here, and then copy them back to the output '
        'folder')
    parser.add_argument('--output', help='path to outputs - will store files here',
                        required=True)
    parser.add_argument(
        '--disable-progressbar', action='store_true',
        help='will disable the progressbar while going over the mini-batch')
    parser.add_argument(
        '--start-from-scratch', action='store_true',
        help='will not load any existing saved model - even if present')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--embeddings-device", type=str, default="cuda",
                        help="Which device to use for embeddings generation.")
    parser.add_argument(
        '--embeddings', action='store_true',
        help="Skip training and generate embeddings for evaluation.")
    parser.add_argument('--embeddings-ckpt', type=str, default=None,
                        help="Checkpoint to load when generating embeddings.")
    parser.add_argument(
        "--dryrun", action="store_true",
        # fixed typo: "validtion" -> "validation"
        help="Dry-run by training on the validation set. Use only to test loop code.")
    mlflow_save_dir = "./mlruns"  # make into arg?
    tbx_save_dir = "./tensorboard"  # make into arg?
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(args.output, exist_ok=True)
    if args.tmp_folder is not None:
        # Stage input data into the temp working folder; results are synced
        # back to --output at the end.
        data_folder_name = os.path.basename(os.path.normpath(args.data))
        rsync_folder(args.data, args.tmp_folder)
        data_dir = os.path.join(args.tmp_folder, data_folder_name)
        output_dir = os.path.join(args.tmp_folder, 'output')
        os.makedirs(output_dir, exist_ok=True)
    else:
        data_dir = args.data
        output_dir = args.output

    # to intercept any print statement:
    sys.stdout = LoggerWriter(logger.info)
    sys.stderr = LoggerWriter(logger.warning)

    # Explicit validation instead of `assert` (asserts are stripped under -O).
    if args.config is None:
        parser.error('--config is required')
    with open(args.config, 'r') as stream:
        hyper_params = load(stream, Loader=yaml.FullLoader)

    exp_name = hyper_params["exp_name"]
    output_dir = os.path.join(output_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)
    # Keep a backup of the config next to the results for reproducibility.
    shutil.copyfile(args.config, os.path.join(output_dir, "config.backup"))
    if "output_dir" in hyper_params:
        raise ValueError('"output_dir" is a reserved key and must not appear in the config')
    hyper_params["output_dir"] = output_dir

    os.makedirs(mlflow_save_dir, exist_ok=True)
    mlf_logger = MLFlowLogger(
        experiment_name=exp_name,
        save_dir=mlflow_save_dir,
    )
    # If a stats file exists we are resuming: keep logging into the same
    # MLflow run as the previous attempt.
    if os.path.exists(os.path.join(output_dir, STAT_FILE_NAME)):
        mlf_logger._run_id = load_mlflow(output_dir)
        logger.warning(
            f"WILL CONTINUE LOGGING IN MLFLOW RUN ID: {mlf_logger._run_id}")

    os.makedirs(tbx_save_dir, exist_ok=True)
    tbx_logger = TensorBoardLogger(
        save_dir=tbx_save_dir,
        name=exp_name,
        default_hp_metric=False,
    )

    # Mirror all root-logger output into a per-experiment console.log file.
    log_path = os.path.join(output_dir, "console.log")
    handler = logging.handlers.WatchedFileHandler(log_path)
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)

    mlflow.set_experiment(exp_name)
    mlflow.start_run(run_id=mlf_logger.run_id)
    try:
        run(args, data_dir, output_dir, hyper_params, mlf_logger, tbx_logger)
    finally:
        # Always close the MLflow run, even if training raises.
        mlflow.end_run()

    if args.tmp_folder is not None:
        # Sync results from the temp working folder back to --output.
        rsync_folder(output_dir + os.path.sep, args.output)