Пример #1
0
def main(args):
    """Run the training and/or validation loop described by ``args``.

    The configuration file named by ``args.config`` is loaded, entries of
    ``config["model"]`` are overridden by the matching command-line values,
    and the result is validated with ``check_classes``, ``check_channels``
    and ``check_model``.  The dataset loader, network and loss classes are
    resolved by name through ``load_module`` and trained with Adam.

    Args:
        args: Parsed command-line namespace.  Attributes read here:
            ``config``, ``out``, ``dataset``, ``loader``, ``bs``, ``lr``,
            ``ts`` (comma-separated integers), ``nn``, ``encoder``,
            ``loss``, ``da``, ``dap``, ``workers``, ``checkpoint``,
            ``resume``, ``epochs``, ``saving``, ``no_training`` and
            ``no_validation``.  ``args.out`` and ``args.workers`` are
            rewritten in place.

    Raises:
        AssertionError: If the dataset path is not a directory, or if the
            checkpoint to resume from already reached ``args.epochs``.
        SystemExit: If both training and validation are disabled, and after
            the single evaluation pass when only training is disabled.
    """
    config = load_config(args.config)
    args.out = os.path.expanduser(args.out)

    # Command-line values win over the config file whenever they are truthy;
    # otherwise the config value is kept (a missing key raises KeyError).
    model_cfg = config["model"]
    overrides = (
        ("loader", args.loader),
        ("bs", args.bs),
        ("lr", args.lr),
        ("ts", tuple(map(int, args.ts.split(","))) if args.ts else None),
        ("nn", args.nn),
        ("encoder", args.encoder),
        ("loss", args.loss),
        ("da", args.da),
        ("dap", args.dap),
    )
    for key, value in overrides:
        model_cfg[key] = value if value else model_cfg[key]

    # Without an explicit worker count, use the batch size.
    if not args.workers:
        args.workers = model_cfg["bs"]

    check_classes(config)
    check_channels(config)
    check_model(config)

    # Expand "~" once and use the same path for the check and for the
    # loaders, so a "~/..." dataset cannot pass the check and then be handed
    # to the loaders unexpanded.
    dataset = os.path.expanduser(args.dataset)
    assert os.path.isdir(dataset), "Dataset is not a directory"
    if args.no_training and args.no_validation:
        sys.exit()

    log = Logs(os.path.join(args.out, "log"))

    csv_train = None
    csv_val = None
    try:
        if not args.no_training:
            csv_train = open(os.path.join(args.out, "training.csv"), mode="a")
        if not args.no_validation:
            csv_val = open(os.path.join(args.out, "validation.csv"), mode="a")

        if torch.cuda.is_available():
            log.log("RoboSat.pink - training on {} GPUs, with {} workers".format(
                torch.cuda.device_count(), args.workers))
            log.log("(Torch:{} Cuda:{} CudNN:{})".format(
                torch.__version__, torch.version.cuda,
                torch.backends.cudnn.version()))
            device = torch.device("cuda")
            torch.backends.cudnn.benchmark = True
        else:
            log.log("RoboSat.pink - training on CPU, with {} workers - (Torch:{})".
                    format(args.workers, torch.__version__))
            log.log("")
            log.log("==========================================================")
            log.log("WARNING: Are you -really- sure about not training on GPU ?")
            log.log("==========================================================")
            log.log("")
            device = torch.device("cpu")

        log.log("--- Input tensor from Dataset: {} ---".format(args.dataset))
        num_channel = 1  # 1-based numbering
        for channel in config["channels"]:
            for band in channel["bands"]:
                log.log("Channel {}:\t\t {}[band: {}]".format(
                    num_channel, channel["name"], band))
                num_channel += 1

        log.log("--- Output Classes ---")
        for c, classe in enumerate(config["classes"]):
            log.log("Class {}:\t\t {}".format(c, classe["title"]))

        log.log("--- Hyper Parameters ---")
        for hp in config["model"]:
            log.log("{}{}".format(hp.ljust(25, " "), config["model"][hp]))

        # The loader class lives in the module named after its lower-cased
        # class name; both splits are built with the "train" mode argument.
        loader = load_module("robosat_pink.loaders.{}".format(
            config["model"]["loader"].lower()))
        loader_class = getattr(loader, config["model"]["loader"])
        loader_train = loader_class(
            config, config["model"]["ts"], os.path.join(dataset, "training"),
            None, "train")
        loader_val = loader_class(
            config, config["model"]["ts"], os.path.join(dataset, "validation"),
            None, "train")

        encoder = config["model"]["encoder"].lower()
        nn_module = load_module("robosat_pink.nn.{}".format(
            config["model"]["nn"].lower()))
        nn = getattr(nn_module, config["model"]["nn"])(loader_train.shape_in,
                                                       loader_train.shape_out,
                                                       encoder, config).to(device)
        nn = torch.nn.DataParallel(nn)
        optimizer = Adam(nn.parameters(), lr=config["model"]["lr"])

        resume = 0
        if args.checkpoint:
            chkpt = torch.load(os.path.expanduser(args.checkpoint),
                               map_location=device)
            nn.load_state_dict(chkpt["state_dict"])
            log.log("--- Using Checkpoint ---")
            log.log("Path:\t\t {}".format(args.checkpoint))
            log.log("UUID:\t\t {}".format(chkpt["uuid"]))

            if args.resume:
                # Restore the optimizer too and continue after the stored epoch.
                optimizer.load_state_dict(chkpt["optimizer"])
                resume = chkpt["epoch"]
                assert resume < args.epochs, "Epoch asked, already reached by the given checkpoint"

        loss_module = load_module("robosat_pink.losses.{}".format(
            config["model"]["loss"].lower()))
        criterion = getattr(loss_module, config["model"]["loss"])().to(device)

        bs = config["model"]["bs"]
        train_loader = DataLoader(loader_train,
                                  batch_size=bs,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=args.workers)
        val_loader = DataLoader(loader_val,
                                batch_size=bs,
                                shuffle=False,
                                drop_last=True,
                                num_workers=args.workers)

        if args.no_training:
            # Validation-only run: one evaluation pass reported as epoch 0.
            epoch = 0
            process(val_loader, config, log, csv_val, epoch, device, nn,
                    criterion, "eval")
            sys.exit()

        for epoch in range(resume + 1, args.epochs + 1):  # 1-N based
            UUID = uuid.uuid1()
            log.log("---{}Epoch: {}/{} -- UUID: {}".format(os.linesep, epoch,
                                                           args.epochs, UUID))

            process(train_loader, config, log, csv_train, epoch, device, nn,
                    criterion, "train", optimizer)

            try:  # https://github.com/pytorch/pytorch/issues/9176
                nn_doc = nn.module.doc
                nn_version = nn.module.version
            except AttributeError:
                nn_doc = nn.doc
                nn_version = nn.version

            states = {
                "uuid": UUID,
                "model_version": nn_version,
                "producer_name": "RoboSat.pink",
                "producer_version": rsp.__version__,
                "model_licence": "MIT",
                "domain": "pink.RoboSat",  # reverse-DNS
                "doc_string": nn_doc,
                "shape_in": loader_train.shape_in,
                "shape_out": loader_train.shape_out,
                "state_dict": nn.state_dict(),
                "epoch": epoch,
                "nn": config["model"]["nn"],
                "encoder": config["model"]["encoder"],
                "optimizer": optimizer.state_dict(),
                "loader": config["model"]["loader"],
            }
            checkpoint_path = os.path.join(args.out,
                                           "checkpoint-{:05d}.pth".format(epoch))
            # Always keep the final epoch; otherwise save every `args.saving`
            # epochs.
            if epoch == args.epochs or not (epoch % args.saving):
                log.log("[Saving checkpoint]")
                torch.save(states, checkpoint_path)

            if not args.no_validation:
                process(val_loader, config, log, csv_val, epoch, device, nn,
                        criterion, "eval")
    finally:
        # Close the CSV handles on every exit path, including the
        # sys.exit() above and any exception raised while training.
        for handle in (csv_train, csv_val):
            if handle is not None:
                handle.close()
Пример #2
0
def main(args):
    """Train a model as described by ``args``, saving a checkpoint per epoch.

    The configuration file named by ``args.config`` is loaded, entries of
    ``config["model"]`` are overridden by the matching command-line values,
    and the result is validated with ``check_classes``, ``check_channels``
    and ``check_model``.  The dataset loader, network and loss classes are
    resolved by name through ``load_module`` and trained with Adam; a
    validation pass follows each training pass unless ``args.no_validation``
    is set.

    Args:
        args: Parsed command-line namespace.  Attributes read here:
            ``config``, ``out``, ``dataset``, ``loader``, ``bs``, ``lr``,
            ``ts``, ``nn``, ``loss``, ``da``, ``dap``, ``workers``,
            ``checkpoint``, ``resume``, ``epochs`` and ``no_validation``.
            ``args.out`` and ``args.workers`` are rewritten in place.

    Raises:
        SystemExit: If the dataset path is not a directory, or if the
            checkpoint to resume from already reached ``args.epochs``.
    """
    config = load_config(args.config)
    args.out = os.path.expanduser(args.out)

    # Default to two workers per visible GPU.  The result is 0 when no GPU
    # is visible (device_count() returns 0), which DataLoader accepts.
    if not args.workers:
        args.workers = torch.cuda.device_count() * 2

    # Command-line values win over the config file whenever they are truthy;
    # otherwise the config value is kept (a missing key raises KeyError).
    model_cfg = config["model"]
    overrides = (
        ("loader", args.loader),
        ("bs", args.bs),
        ("lr", args.lr),
        ("ts", args.ts),
        ("nn", args.nn),
        ("loss", args.loss),
        ("da", args.da),
        ("dap", args.dap),
    )
    for key, value in overrides:
        model_cfg[key] = value if value else model_cfg[key]

    check_classes(config)
    check_channels(config)
    check_model(config)

    # Expand "~" once and use the same path for the check and for the
    # loaders, so a "~/..." dataset cannot pass the check and then be handed
    # to the loaders unexpanded.
    dataset = os.path.expanduser(args.dataset)
    if not os.path.isdir(dataset):
        sys.exit("ERROR: dataset {} is not a directory".format(args.dataset))

    log = Logs(os.path.join(args.out, "log"))

    if torch.cuda.is_available():
        log.log("RoboSat.pink - training on {} GPUs, with {} workers".format(torch.cuda.device_count(), args.workers))
        log.log("(Torch:{} Cuda:{} CudNN:{})".format(torch.__version__, torch.version.cuda, torch.backends.cudnn.version()))
        device = torch.device("cuda")
        torch.backends.cudnn.benchmark = True
    else:
        log.log("RoboSat.pink - training on CPU, with {} workers - (Torch:{})".format(args.workers, torch.__version__))
        log.log("WARNING: Are you really sure sure about not training on GPU ?")
        device = torch.device("cpu")

    # The loader class lives in the module named after its lower-cased class
    # name; both splits are built with the "train" mode argument.
    loader = load_module("robosat_pink.loaders.{}".format(config["model"]["loader"].lower()))
    loader_class = getattr(loader, config["model"]["loader"])
    loader_train = loader_class(config, config["model"]["ts"], os.path.join(dataset, "training"), "train")
    loader_val = loader_class(config, config["model"]["ts"], os.path.join(dataset, "validation"), "train")

    model_module = load_module("robosat_pink.models.{}".format(config["model"]["nn"].lower()))

    nn = getattr(model_module, config["model"]["nn"])(loader_train.shape_in, loader_train.shape_out, config).to(device)
    nn = torch.nn.DataParallel(nn)
    optimizer = Adam(nn.parameters(), lr=config["model"]["lr"])

    resume = 0
    if args.checkpoint:
        chkpt = torch.load(os.path.expanduser(args.checkpoint), map_location=device)
        nn.load_state_dict(chkpt["state_dict"])
        log.log("Using checkpoint: {}".format(args.checkpoint))

        if args.resume:
            # Restore the optimizer too and continue after the stored epoch.
            optimizer.load_state_dict(chkpt["optimizer"])
            resume = chkpt["epoch"]
            if resume >= args.epochs:
                # Report the same value the check above compares against.
                sys.exit("ERROR: Epoch {} already reached by the given checkpoint".format(args.epochs))

    loss_module = load_module("robosat_pink.losses.{}".format(config["model"]["loss"].lower()))
    criterion = getattr(loss_module, config["model"]["loss"])().to(device)

    bs = config["model"]["bs"]
    train_loader = DataLoader(loader_train, batch_size=bs, shuffle=True, drop_last=True, num_workers=args.workers)
    val_loader = DataLoader(loader_val, batch_size=bs, shuffle=False, drop_last=True, num_workers=args.workers)

    log.log("--- Input tensor from Dataset: {} ---".format(args.dataset))
    num_channel = 1  # 1-based numbering
    for channel in config["channels"]:
        for band in channel["bands"]:
            log.log("Channel {}:\t\t {}[band: {}]".format(num_channel, channel["name"], band))
            num_channel += 1

    log.log("--- Hyper Parameters ---")
    for hp in config["model"]:
        log.log("{}{}".format(hp.ljust(25, " "), config["model"][hp]))

    # `epoch` is 0-based here; logs, checkpoint payload and file name use
    # the 1-based `epoch + 1`.
    for epoch in range(resume, args.epochs):
        UUID = uuid.uuid1()
        log.log("---{}Epoch: {}/{} -- UUID: {}".format(os.linesep, epoch + 1, args.epochs, UUID))

        process(train_loader, config, log, device, nn, criterion, "train", optimizer)
        if not args.no_validation:
            process(val_loader, config, log, device, nn, criterion, "eval")

        try:  # https://github.com/pytorch/pytorch/issues/9176
            nn_doc = nn.module.doc
            nn_version = nn.module.version
        except AttributeError:
            nn_version = nn.version
            nn_doc = nn.doc

        states = {
            "uuid": UUID,
            "model_version": nn_version,
            "producer_name": "RoboSat.pink",
            "producer_version": "0.4.0",
            "model_licence": "MIT",
            "domain": "pink.RoboSat",  # reverse-DNS
            "doc_string": nn_doc,
            "shape_in": loader_train.shape_in,
            "shape_out": loader_train.shape_out,
            "state_dict": nn.state_dict(),
            "epoch": epoch + 1,
            "nn": config["model"]["nn"],
            "optimizer": optimizer.state_dict(),
            "loader": config["model"]["loader"],
        }
        checkpoint_path = os.path.join(args.out, "checkpoint-{:05d}.pth".format(epoch + 1))
        torch.save(states, checkpoint_path)