from argparse import ArgumentParser

import pytorch_lightning as pl
from azureml.core import Run
from pytorch_lightning.loggers import MLFlowLogger
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

# LitAutoEncoder is assumed to be defined (or imported) elsewhere in this file.


def cli_main():
    pl.seed_everything(1234)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--hidden_dim", type=int, default=128)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    # ------------
    # data
    # ------------
    dataset = MNIST("",
                    train=True,
                    download=True,
                    transform=transforms.ToTensor())
    mnist_test = MNIST("",
                       train=False,
                       download=True,
                       transform=transforms.ToTensor())
    # split the 60,000 MNIST training images into 55,000 train / 5,000 validation
    mnist_train, mnist_val = random_split(dataset, [55000, 5000])

    train_loader = DataLoader(mnist_train, batch_size=args.batch_size)
    val_loader = DataLoader(mnist_val, batch_size=args.batch_size)
    test_loader = DataLoader(mnist_test, batch_size=args.batch_size)

    # ------------
    # model
    # ------------
    model = LitAutoEncoder()

    # ------------
    # logging
    # ------------
    # get azureml run object
    run = Run.get_context()
    # get the tracking uri for the azureml workspace
    mlflow_uri = run.experiment.workspace.get_mlflow_tracking_uri()
    # get the azureml experiment name
    exp_name = run.experiment.name

    mlf_logger = MLFlowLogger(experiment_name=exp_name,
                              tracking_uri=mlflow_uri)
    # link the mlflowlogger run ID to the azureml run ID
    mlf_logger._run_id = run.id

    # ------------
    # training
    # ------------
    trainer = pl.Trainer.from_argparse_args(args, logger=mlf_logger)
    trainer.fit(model, train_loader, val_loader)

    # ------------
    # testing
    # ------------
    result = trainer.test(dataloaders=test_loader)
    print(result)
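Since `pl.Trainer.add_argparse_args(parser)` exposes every Trainer flag on the command line, the script can be driven entirely from the CLI. A minimal sketch of the entry-point guard (an assumption; the original excerpt does not show it):

# The script name and flags below are hypothetical examples, e.g.:
#   python train.py --batch_size 64 --max_epochs 5
if __name__ == "__main__":
    cli_main()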
Example #2
File: train.py  Project: dkmiller/tidbits
def create_logger() -> Union[bool, LightningLoggerBase]:
    """
    Loosely imitate:
    https://github.com/Azure/azureml-examples/blob/main/tutorials/using-pytorch-lightning/3.log-with-mlflow.ipynb
    """
    run = Run.get_context()
    if isinstance(run, _SubmittedRun):
        experiment = run.experiment
        tracking_uri = experiment.workspace.get_mlflow_tracking_uri()
        exp_name = experiment.name
        log.info(
            f"Using MLFlow logger with tracking URI {tracking_uri} and experiment name {exp_name}"
        )
        rv = MLFlowLogger(experiment_name=exp_name, tracking_uri=tracking_uri)
        # link the MLFlowLogger run ID to the AzureML run ID so metrics land
        # on the submitted run
        rv._run_id = run.id
    else:
        log.warning("Unable to get AML run context! Logging locally.")
        rv = True

    return rv
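The `Union[bool, LightningLoggerBase]` return type mirrors what `pl.Trainer` accepts for its `logger` argument, so the result can be passed straight through. A minimal usage sketch (assuming `pytorch_lightning` is imported as `pl`):

# `logger=True` falls back to Lightning's default local logger, while an
# MLFlowLogger instance streams metrics to the AzureML workspace.
trainer = pl.Trainer(logger=create_logger())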
Example #3
def main():
    """Main entry point of the program.

    Note:
        This main.py file is meant to be called from the CLI;
        see the `examples/local/run.sh` file for an example of how to use it.

    """
    parser = argparse.ArgumentParser()
    # __TODO__ check you need all the following CLI parameters
    parser.add_argument(
        '--config',
        help='config file with generic hyper-parameters, such as optimizer, '
        'batch_size, ... - in yaml format')
    parser.add_argument('--data', help='path to data', required=True)
    parser.add_argument('--data-module',
                        default="hdf5",
                        help="Data module to use: file or hdf5")
    parser.add_argument(
        '--tmp-folder',
        help='will use this folder as the working folder - it will copy the '
        'input data here, generate results here, and then copy them back to '
        'the output folder')
    parser.add_argument('--output',
                        help='path to outputs - will store files here',
                        required=True)
    parser.add_argument(
        '--disable-progressbar',
        action='store_true',
        help='will disable the progressbar while going over the mini-batch')
    parser.add_argument(
        '--start-from-scratch',
        action='store_true',
        help='will not load any existing saved model - even if present')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--embeddings-device",
                        type=str,
                        default="cuda",
                        help="Which device to use for embeddings generation.")
    parser.add_argument(
        '--embeddings',
        action='store_true',
        help="Skip training and generate embeddings for evaluation.")
    parser.add_argument('--embeddings-ckpt',
                        type=str,
                        default=None,
                        help="Checkpoint to load when generating embeddings.")
    parser.add_argument(
        "--dryrun",
        action="store_true",
        help="Dry-run by training on the validation set. Use only to test "
        "loop code.")

    mlflow_save_dir = "./mlruns"  # make into arg?
    tbx_save_dir = "./tensorboard"  # make into arg?

    parser = pl.Trainer.add_argparse_args(parser)

    args = parser.parse_args()

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    os.makedirs(args.output, exist_ok=True)

    if args.tmp_folder is not None:
        data_folder_name = os.path.basename(os.path.normpath(args.data))
        rsync_folder(args.data, args.tmp_folder)
        data_dir = os.path.join(args.tmp_folder, data_folder_name)
        output_dir = os.path.join(args.tmp_folder, 'output')
        os.makedirs(output_dir, exist_ok=True)
    else:
        data_dir = args.data
        output_dir = args.output

    # intercept any print statements and route them through the logger:
    sys.stdout = LoggerWriter(logger.info)
    sys.stderr = LoggerWriter(logger.warning)

    assert args.config is not None
    with open(args.config, 'r') as stream:
        hyper_params = yaml.load(stream, Loader=yaml.FullLoader)
    exp_name = hyper_params["exp_name"]
    output_dir = os.path.join(output_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)
    shutil.copyfile(args.config, os.path.join(output_dir, "config.backup"))
    assert "output_dir" not in hyper_params
    hyper_params["output_dir"] = output_dir
    os.makedirs(mlflow_save_dir, exist_ok=True)
    mlf_logger = MLFlowLogger(
        experiment_name=exp_name,
        save_dir=mlflow_save_dir,
    )
    if os.path.exists(os.path.join(output_dir, STAT_FILE_NAME)):
        mlf_logger._run_id = load_mlflow(output_dir)
        logger.warning(
            f"WILL CONTINUE LOGGING IN MLFLOW RUN ID: {mlf_logger._run_id}")
    os.makedirs(tbx_save_dir, exist_ok=True)
    tbx_logger = TensorBoardLogger(
        save_dir=tbx_save_dir,
        name=exp_name,
        default_hp_metric=False,
    )

    log_path = os.path.join(output_dir, "console.log")
    handler = logging.handlers.WatchedFileHandler(log_path)
    formatter = logging.Formatter(logging.BASIC_FORMAT)
    handler.setFormatter(formatter)
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(handler)

    mlflow.set_experiment(exp_name)
    mlflow.start_run(run_id=mlf_logger.run_id)
    run(args, data_dir, output_dir, hyper_params, mlf_logger, tbx_logger)
    mlflow.end_run()
    if args.tmp_folder is not None:
        rsync_folder(output_dir + os.path.sep, args.output)
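`LoggerWriter`, used above to redirect `print` output and tracebacks into the logging module, is defined elsewhere in the project. A minimal sketch of the usual file-like adapter, written as an assumption about its shape rather than the project's actual implementation:

class LoggerWriter:
    """File-like object that forwards writes to a logging callable."""

    def __init__(self, printer):
        # printer is a logging callable, e.g. logger.info or logger.warning
        self.printer = printer

    def write(self, message):
        # skip the bare newlines that `print` emits between writes
        if message != '\n':
            self.printer(message)

    def flush(self):
        # nothing is buffered; present only to satisfy the stream interface
        pass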