Example #1
def train_model():
    """Trains the model."""

    # Build the model (before the loaders to speed up debugging)
    model = model_builder.build_model()
    log_model_info(model)

    # Define the loss function
    loss_fun = losses.get_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)

    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and cu.has_checkpoint():
        last_checkpoint = cu.get_last_checkpoint()
        checkpoint_epoch = cu.load_checkpoint(last_checkpoint, model,
                                              optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        cu.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))

    # Compute precise time
    if start_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        bu.compute_precise_time(model, loss_fun)
        nu.reset_bn_stats(model)

    # Create data loaders
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()

    # Create meters
    train_meter = TrainMeter(len(train_loader))
    test_meter = TestMeter(len(test_loader))

    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            nu.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if cu.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = cu.save_checkpoint(model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            test_epoch(test_loader, model, test_meter, cur_epoch)
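
Note: the train_epoch and test_epoch helpers called throughout these examples are not part of the listing. As a rough sketch of the single-epoch loop that Example #1 (and the similar examples below) assume, something along these lines would be compatible; the loop body, the .cuda() transfer, and the omitted meter updates are assumptions for illustration, not code from the original project:

def train_epoch(loader, model, loss_fun, optimizer, meter, cur_epoch):
    """Minimal sketch of one training epoch (not part of the original example)."""
    model.train()
    for inputs, labels in loader:
        # Transfer the batch to the GPU (assumed single-device setup)
        inputs = inputs.cuda(non_blocking=True)
        labels = labels.cuda(non_blocking=True)
        # Forward pass, loss, backward pass, and parameter update
        preds = model(inputs)
        loss = loss_fun(preds, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Meter updates (timing, loss, error rates) are omitted for brevity
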
Example #2
def train_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, ema, loss_fun, and optimizer
    model = setup_model()
    ema = deepcopy(model)
    loss_fun = builders.build_loss_fun().cuda()
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and cp.has_checkpoint():
        file = cp.get_last_checkpoint()
        epoch = cp.load_checkpoint(file, model, ema, optimizer)[0]
        logger.info("Loaded checkpoint from: {}".format(file))
        start_epoch = epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        train_weights = get_weights_file(cfg.TRAIN.WEIGHTS)
        cp.load_checkpoint(train_weights, model, ema)
        logger.info("Loaded initial weights from: {}".format(train_weights))
    # Create data loaders and meters
    train_loader = data_loader.construct_train_loader()
    test_loader = data_loader.construct_test_loader()
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    ema_meter = meters.TestMeter(len(test_loader), "test_ema")
    # Create a GradScaler for mixed precision training
    scaler = amp.GradScaler(enabled=cfg.TRAIN.MIXED_PRECISION)
    # Compute model and loader timings
    if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
        benchmark.compute_time_full(model, loss_fun, train_loader, test_loader)
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        params = (train_loader, model, ema, loss_fun, optimizer, scaler,
                  train_meter)
        train_epoch(*params, cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
            net.compute_precise_bn_stats(ema, train_loader)
        # Evaluate the model
        test_epoch(test_loader, model, test_meter, cur_epoch)
        test_epoch(test_loader, ema, ema_meter, cur_epoch)
        test_err = test_meter.get_epoch_stats(cur_epoch)["top1_err"]
        ema_err = ema_meter.get_epoch_stats(cur_epoch)["top1_err"]
        # Save a checkpoint
        file = cp.save_checkpoint(model, ema, optimizer, cur_epoch, test_err,
                                  ema_err)
        logger.info("Wrote checkpoint to: {}".format(file))
Example #3
def train_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, loss_fun, and optimizer
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model,
                                                      optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))
    # Create data loaders and meters
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    # Compute model and loader timings
    if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
        benchmark.compute_time_full(model, loss_fun, train_loader, test_loader)
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        if hasattr(cfg, 'search_epoch'):
            if cur_epoch >= cfg.search_epoch:
                break
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if (cur_epoch + 1) % cfg.TRAIN.CHECKPOINT_PERIOD == 0:
            checkpoint_file = checkpoint.save_checkpoint(
                model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        next_epoch = cur_epoch + 1
        if next_epoch % cfg.TRAIN.EVAL_PERIOD == 0 or next_epoch == cfg.OPTIM.MAX_EPOCH:
            stats = test_epoch(test_loader, model, test_meter, cur_epoch)
            nni.report_intermediate_result(stats['top1_err'])
    nni.report_final_result(test_meter.min_top1_err)
Example #4
def train_kd_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, loss_fun, and optimizer
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and cp.has_checkpoint():
        file = cp.get_last_checkpoint()
        epoch = cp.load_checkpoint(file, model, optimizer)
        logger.info("Loaded checkpoint from: {}".format(file))
        start_epoch = epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        cp.load_checkpoint(cfg.TRAIN.WEIGHTS, model, strict=False)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))
    # Create data loaders and meters
    train_loader = data_loader.construct_train_loader()
    test_loader = data_loader.construct_test_loader()
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    # Create a GradScaler for mixed precision training
    scaler = amp.GradScaler(enabled=cfg.TRAIN.MIXED_PRECISION)
    # Compute model and loader timings
    if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
        benchmark.compute_time_full(model, loss_fun, train_loader, test_loader)
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    best_err = np.inf
    # Create the teacher model
    teacher = setup_teacher_model()
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        params = (train_loader, model, loss_fun, optimizer, scaler,
                  train_meter, teacher)
        train_kd_epoch(*params, cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Evaluate the model
        test_epoch(test_loader, model, test_meter, cur_epoch)
        # Check if checkpoint is best so far (note: should checkpoint meters as well)
        stats = test_meter.get_epoch_stats(cur_epoch)
        best = stats["top1_err"] <= best_err
        best_err = min(stats["top1_err"], best_err)
        # Save a checkpoint
        file = cp.save_checkpoint(model, optimizer, cur_epoch, best)
        logger.info("Wrote checkpoint to: {}".format(file))
Example #5
def train_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, loss_fun, and optimizer
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model,
                                                      optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))
    # Compute precise time
    if start_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        prec_time = net.compute_precise_time(model, loss_fun)
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)
    # Create data loaders and meters
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if (cur_epoch + 1) % cfg.TRAIN.CHECKPOINT_PERIOD == 0:
            checkpoint_file = checkpoint.save_checkpoint(
                model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        next_epoch = cur_epoch + 1
        if next_epoch % cfg.TRAIN.EVAL_PERIOD == 0 or next_epoch == cfg.OPTIM.MAX_EPOCH:
            test_epoch(test_loader, model, test_meter, cur_epoch)
Example #6
def setup_model():
    """Sets up a model for training or testing and log the results."""
    # Build the model
    model = builders.build_model()
    logger.info("Model:\n{}".format(model)) if cfg.VERBOSE else ()
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    #assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    assert cfg.NUM_GPUS <= torch.npu.device_count(), err_str
    cur_device = torch.npu.current_device()
    model = model.to(cur_device)
    optimizer = optim.construct_optimizer(model)
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O2",
                                      loss_scale=128)
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(model, device_ids=[cur_device], broadcast_buffers=False)

    return model, optimizer
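
Note: unlike the other setup_model variants in this listing, this NPU/apex version returns both the DDP-wrapped model and the amp-initialized optimizer, so a caller would unpack both rather than constructing the optimizer again, e.g.:

model, optimizer = setup_model()  # apex amp.initialize has already wrapped both objects
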
Example #7
def train_model():
    """Trains the model."""

    # Setup logging
    logging.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))

    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK

    # Build the model (before the loaders to speed up debugging)
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))

    # Define the loss function
    loss_fun = builders.build_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)

    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model,
                                                      optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))

    # Compute precise time
    if start_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        prec_time = net.compute_precise_time(model, loss_fun)
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)

    # Create data loaders
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()

    # Create meters
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))

    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))

    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if checkpoint.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = checkpoint.save_checkpoint(
                model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            test_epoch(test_loader, model, test_meter, cur_epoch)
Example #8
def test_ftta_model(corruptions, levels):
    """Use feed back to fine-tune some part of the model. (with all kind of corruptions)"""
    all_results = []
    for corruption_level in levels:
        lvl_results = []
        for corruption_type in corruptions:
            cfg.TRAIN.CORRUPTION = corruption_type
            cfg.TRAIN.LEVEL = corruption_level
            cfg.TEST.CORRUPTION = corruption_type
            cfg.TEST.LEVEL = corruption_level

            # Setup training/testing environment
            setup_env()
            # Construct the model, loss_fun, and optimizer
            model = setup_model()
            loss_fun = builders.build_loss_fun().cuda()
            optimizer = optim.construct_optimizer(model)
            # Load checkpoint or initial weights
            start_epoch = 0
            checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS,
                                       model,
                                       strict=cfg.TRAIN.LOAD_STRICT)
            logger.info("Loaded initial weights from: {}".format(
                cfg.TRAIN.WEIGHTS))
            # Create data loaders and meters
            train_loader = loader.construct_train_loader()
            test_loader = loader.construct_test_loader()
            train_meter = meters.TrainMeter(len(train_loader))
            test_meter = meters.TestMeter(len(test_loader))
            # Compute model and loader timings
            if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
                benchmark.compute_time_full(model, loss_fun, train_loader,
                                            test_loader)

            # Perform the training loop
            logger.info("Start epoch: {}".format(start_epoch + 1))
            for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
                if cfg.TRAIN.ADAPTATION != 'test_only':
                    if cfg.TRAIN.ADAPTATION == 'update_bn':
                        bn_update(model, train_loader)
                    elif cfg.TRAIN.ADAPTATION == 'min_entropy':
                        # Train for one epoch
                        train_epoch(train_loader, model, loss_fun, optimizer,
                                    train_meter, cur_epoch)
                        bn_update(model, train_loader)

                    # Save a checkpoint
                    if (cur_epoch + 1) % cfg.TRAIN.CHECKPOINT_PERIOD == 0:
                        checkpoint_file = checkpoint.save_checkpoint(
                            model, optimizer, cur_epoch)
                        logger.info(
                            "Wrote checkpoint to: {}".format(checkpoint_file))

                # Evaluate the model
                next_epoch = cur_epoch + 1
                if next_epoch % cfg.TRAIN.EVAL_PERIOD == 0 or next_epoch == cfg.OPTIM.MAX_EPOCH:
                    top1 = test_epoch(test_loader, model, test_meter,
                                      cur_epoch)
            lvl_results.append(top1)
        all_results.append(lvl_results)

    for lvl_idx in range(len(all_results)):
        logger.info("corruption level: {}".format(levels[lvl_idx]))
        logger.info("corruption types: {}".format(corruptions))
        logger.info(all_results[lvl_idx])

    # show_parameters(model)

    return all_results
Example #9
def train_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, loss_fun, and optimizer
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    if "search" in cfg.MODEL.TYPE:
        params_w = [v for k, v in model.named_parameters() if "alphas" not in k]
        params_a = [v for k, v in model.named_parameters() if "alphas" in k]
        optimizer_w = torch.optim.SGD(
            params=params_w,
            lr=cfg.OPTIM.BASE_LR,
            momentum=cfg.OPTIM.MOMENTUM,
            weight_decay=cfg.OPTIM.WEIGHT_DECAY,
            dampening=cfg.OPTIM.DAMPENING,
            nesterov=cfg.OPTIM.NESTEROV
        )
        if cfg.OPTIM.ARCH_OPTIM == "adam":
            optimizer_a = torch.optim.Adam(
                params=params_a,
                lr=cfg.OPTIM.ARCH_BASE_LR,
                betas=(0.5, 0.999),
                weight_decay=cfg.OPTIM.ARCH_WEIGHT_DECAY
            )
        elif cfg.OPTIM.ARCH_OPTIM == "sgd":
            optimizer_a = torch.optim.SGD(
                params=params_a,
                lr=cfg.OPTIM.ARCH_BASE_LR,
                momentum=cfg.OPTIM.MOMENTUM,
                weight_decay=cfg.OPTIM.ARCH_WEIGHT_DECAY,
                dampening=cfg.OPTIM.DAMPENING,
                nesterov=cfg.OPTIM.NESTEROV
            )
        optimizer = [optimizer_w, optimizer_a]
    else:
        optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model, optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(cfg.TRAIN.WEIGHTS))
    # Create data loaders and meters
    if cfg.TRAIN.PORTION < 1:
        if "search" in cfg.MODEL.TYPE:
            train_loader = [loader._construct_loader(
                dataset_name=cfg.TRAIN.DATASET,
                split=cfg.TRAIN.SPLIT,
                batch_size=int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS),
                shuffle=True,
                drop_last=True,
                portion=cfg.TRAIN.PORTION,
                side="l"
            ),
            loader._construct_loader(
                dataset_name=cfg.TRAIN.DATASET,
                split=cfg.TRAIN.SPLIT,
                batch_size=int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS),
                shuffle=True,
                drop_last=True,
                portion=cfg.TRAIN.PORTION,
                side="r"
            )]
        else:
            train_loader = loader._construct_loader(
                dataset_name=cfg.TRAIN.DATASET,
                split=cfg.TRAIN.SPLIT,
                batch_size=int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS),
                shuffle=True,
                drop_last=True,
                portion=cfg.TRAIN.PORTION,
                side="l"
            )
        test_loader = loader._construct_loader(
            dataset_name=cfg.TRAIN.DATASET,
            split=cfg.TRAIN.SPLIT,
            batch_size=int(cfg.TRAIN.BATCH_SIZE / cfg.NUM_GPUS),
            shuffle=False,
            drop_last=False,
            portion=cfg.TRAIN.PORTION,
            side="r"
        )
    else:
        train_loader = loader.construct_train_loader()
        test_loader = loader.construct_test_loader()
    train_meter_type = meters.TrainMeterIoU if cfg.TASK == "seg" else meters.TrainMeter
    test_meter_type = meters.TestMeterIoU if cfg.TASK == "seg" else meters.TestMeter
    l = train_loader[0] if isinstance(train_loader, list) else train_loader
    train_meter = train_meter_type(len(l))
    test_meter = test_meter_type(len(test_loader))
    # Compute model and loader timings
    if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
        l = train_loader[0] if isinstance(train_loader, list) else train_loader
        benchmark.compute_time_full(model, loss_fun, l, test_loader)
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        f = search_epoch if "search" in cfg.MODEL.TYPE else train_epoch
        f(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if (cur_epoch + 1) % cfg.TRAIN.CHECKPOINT_PERIOD == 0:
            checkpoint_file = checkpoint.save_checkpoint(model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        next_epoch = cur_epoch + 1
        if next_epoch % cfg.TRAIN.EVAL_PERIOD == 0 or next_epoch == cfg.OPTIM.MAX_EPOCH:
            test_epoch(test_loader, model, test_meter, cur_epoch)
Example #10
def main(cfg):

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME)
    if not os.path.exists(dataset_out_dir):
        os.mkdir(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    trainSet_path, valSet_path = data_obj.makeTVSets(train_split_ratio=cfg.ACTIVE_LEARNING.INIT_RATIO, \
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data, seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    trainSet, valSet = data_obj.loadTVPartitions(trainSetPath=trainSet_path,
                                                 valSetPath=valSet_path)

    print("Data Partitioning Complete. \nTrain Set: {},  Validation Set: {}\n".
          format(len(trainSet), len(valSet)))
    logger.info("\nTrain Set: {},  Validation Set: {}\n".format(
        len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getIndexesDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))

    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS,
                                                    cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(
        cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train models
    print("======== ENSEMBLE TRAINING ========")
    logger.info("======== ENSEMBLE TRAINING ========")

    best_model_paths = []
    test_accs = []
    for i in range(num_ensembles):
        print("=== Training ensemble [{}/{}] ===".format(i + 1, num_ensembles))

        # Construct the optimizer
        optimizer = optim.construct_optimizer(cfg, models[i])
        print("optimizer: {}\n".format(optimizer))
        logger.info("optimizer: {}\n".format(optimizer))

        # Each ensemble gets its own output directory
        cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR,
                                       'model_{}'.format(i + 1))

        # Train the model
        best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(
            trainSet_loader, valSet_loader, models[i], optimizer, cfg)
        best_model_paths.append(checkpoint_file)

        print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".
              format(i + 1, round(best_val_acc, 4), best_val_epoch))
        logger.info(
            "Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n".
            format(i + 1, round(best_val_acc, 4), best_val_epoch))

        # Test the model
        print("=== Testing ensemble [{}/{}] ===".format(i + 1, num_ensembles))
        test_acc = ensemble_test_model(test_loader,
                                       checkpoint_file,
                                       cfg,
                                       cur_episode=0)
        test_accs.append(test_acc)

        print("Test Accuracy by Model {}: {}.\n".format(
            i + 1, round(test_acc, 4)))
        logger.info("Test Accuracy by Model {}: {}.\n".format(i + 1, test_acc))

        # Reset EPISODE_DIR
        cfg.EPISODE_DIR = cfg.EXP_DIR

    # Test each best model checkpoint and report the average
    print("======== ENSEMBLE TESTING ========\n")
    logger.info("======== ENSEMBLE TESTING ========\n")

    mean_test_acc = np.mean(test_accs)
    print("Average Ensemble Test Accuracy: {}.\n".format(
        round(mean_test_acc, 4)))
    logger.info("Average Ensemble Test Accuracy: {}.\n".format(mean_test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")
Example #11
def main(cfg):

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    print("\nSampling Initial Pool using {}.".format(
        str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    logger.info("\nSampling Initial Pool using {}.".format(
        str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    if cfg.INIT_POOL.SAMPLING_FN == 'random':
        lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(
            train_split_ratio=cfg.INIT_POOL.INIT_RATIO,
            val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,
            seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)
    else:
        lSet, uSet = InitialPool(cfg).sample_from_uSet(train_data)
        lSet_path = f'{cfg.EXP_DIR}/lSet.npy'
        np.save(lSet_path, lSet)
        np.save(f'{cfg.EXP_DIR}/lSet_initial.npy', lSet)
        uSet_path, valSet_path = data_obj.makeUVSets(
            val_split_ratio=cfg.DATASET.VAL_RATIO,
            data=uSet,
            seed_id=cfg.RNG_SEED,
            save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(
        lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH,
        uSetPath=cfg.ACTIVE_LEARNING.USET_PATH,
        valSetPath=cfg.ACTIVE_LEARNING.VALSET_PATH)

    print(
        "Data Partitioning Complete. \nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n"
        .format(len(lSet), len(uSet), len(valSet)))
    logger.info(
        "Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(
            len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(
        indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    uSet_loader = data_obj.getIndexesDataLoader(
        indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))
    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS,
                                                    cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(
        cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    print("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))

    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):

        wandb.log({"Episode": cur_episode})

        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info(
            "======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train models
        print("======== ENSEMBLE TRAINING ========")
        logger.info("======== ENSEMBLE TRAINING ========")

        best_model_paths = []
        test_accs = []
        for i in range(num_ensembles):
            print("=== Training ensemble [{}/{}] ===".format(
                i + 1, num_ensembles))

            # Construct the optimizer
            optimizer = optim.construct_optimizer(cfg, models[i])
            print("optimizer: {}\n".format(optimizer))
            logger.info("optimizer: {}\n".format(optimizer))

            # Each ensemble gets its own output directory
            cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR,
                                           'model_{}'.format(i + 1))

            # Train the model
            best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(
                lSet_loader, valSet_loader, models[i], optimizer, cfg)
            best_model_paths.append(checkpoint_file)

            print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".
                  format(i + 1, round(best_val_acc, 4), best_val_epoch))
            logger.info(
                "EPISODE {} Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n"
                .format(cur_episode, i + 1, round(best_val_acc, 4),
                        best_val_epoch))

            # Test the model
            print("=== Testing ensemble [{}/{}] ===".format(
                i + 1, num_ensembles))
            test_acc = ensemble_test_model(test_loader, checkpoint_file, cfg,
                                           cur_episode)
            test_accs.append(test_acc)

            print("Test Accuracy by Model {}: {}.\n".format(
                i + 1, round(test_acc, 4)))
            logger.info("EPISODE {} Test Accuracy by Model {}: {}.\n".format(
                cur_episode, i + 1, test_acc))

            # Reset EPISODE_DIR
            cfg.EPISODE_DIR = episode_dir

        # Test each best model checkpoint and report the average
        print("======== ENSEMBLE TESTING ========\n")
        logger.info("======== ENSEMBLE TESTING ========\n")
        mean_test_acc = np.mean(test_accs)
        print("Average Ensemble Test Accuracy: {}.\n".format(
            round(mean_test_acc, 4)))
        logger.info("EPISODE {} Average Ensemble Test Accuracy: {}.\n".format(
            cur_episode, mean_test_acc))
        wandb.log({"Test Accuracy": mean_test_acc})

        global plot_episode_xvalues
        global plot_episode_yvalues

        global plot_epoch_xvalues
        global plot_epoch_yvalues

        global plot_it_x_values
        global plot_it_y_values

        plot_episode_xvalues.append(cur_episode)
        plot_episode_yvalues.append(mean_test_acc)

        plot_arrays(x_vals=plot_episode_xvalues, y_vals=plot_episode_yvalues, \
            x_name="Episodes", y_name="Test Accuracy", dataset_name=cfg.DATASET.NAME, out_dir=cfg.EXP_DIR)

        save_plot_values([plot_episode_xvalues, plot_episode_yvalues], \
            ["plot_episode_xvalues", "plot_episode_yvalues"], out_dir=cfg.EXP_DIR, saveInTextFormat=True)

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            break

        # Active Sample
        print("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        logger.info("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_models = []
        for i in range(num_ensembles):
            temp = model_builder.build_model(cfg)
            clf_models.append(cu.load_checkpoint(best_model_paths[i], temp))

        activeSet, new_uSet = al_obj.sample_from_uSet(
            None, lSet, uSet, train_data, supportingModels=clf_models)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(
            indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(
            indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(
            indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print(
            "Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info(
            "Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")
Example #12
def main(cfg):

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Auto assign a RNG_SEED when not supplied a value
    if cfg.RNG_SEED is None:
        cfg.RNG_SEED = np.random.randint(100)

    # Using specific GPU
    # os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                               isTrain=False,
                                               isDownload=True)

    print(
        "\nDataset {} Loaded Sucessfully.\nTotal Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))
    logger.info(
        "Dataset {} Loaded Sucessfully. Total Train Size: {} and Total Test Size: {}\n"
        .format(cfg.DATASET.NAME, train_size, test_size))

    lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(train_split_ratio=cfg.ACTIVE_LEARNING.INIT_L_RATIO, \
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data, seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(
        lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH,
        uSetPath=cfg.ACTIVE_LEARNING.USET_PATH,
        valSetPath=cfg.ACTIVE_LEARNING.VALSET_PATH)

    print(
        "Data Partitioning Complete. \nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n"
        .format(len(lSet), len(uSet), len(valSet)))
    logger.info(
        "Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(
            len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(
        indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data,
                                         test_batch_size=cfg.TRAIN.BATCH_SIZE,
                                         seed_id=cfg.RNG_SEED)

    # Initialize the model.
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    print("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("AL Query Method: {}\nMax AL Episodes: {}\n".format(
        cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))

    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):

        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info(
            "======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train model
        print("======== TRAINING ========")
        logger.info("======== TRAINING ========")

        best_val_acc, best_val_epoch, checkpoint_file = train_model(
            lSet_loader, valSet_loader, model, optimizer, cfg)

        print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(
            round(best_val_acc, 4), best_val_epoch))
        logger.info(
            "EPISODE {} Best Validation Accuracy: {}\tBest Epoch: {}\n".format(
                cur_episode, round(best_val_acc, 4), best_val_epoch))

        # Test best model checkpoint
        print("======== TESTING ========\n")
        logger.info("======== TESTING ========\n")
        test_acc = test_model(test_loader, checkpoint_file, cfg, cur_episode)
        print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
        logger.info("EPISODE {} Test Accuracy {}.\n".format(
            cur_episode, test_acc))

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            # Save current lSet, uSet in the final episode directory
            data_obj.saveSet(lSet, 'lSet', cfg.EPISODE_DIR)
            data_obj.saveSet(uSet, 'uSet', cfg.EPISODE_DIR)
            break

        # Active Sample
        print("======== ACTIVE SAMPLING ========\n")
        logger.info("======== ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_model = model_builder.build_model(cfg)
        clf_model = cu.load_checkpoint(checkpoint_file, clf_model)
        activeSet, new_uSet = al_obj.sample_from_uSet(clf_model, lSet, uSet,
                                                      train_data)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(
            indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(
            indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(
            indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info(
            "Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n"
            .format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")
Example #13
def train_model():
    """Trains the model."""
    # Setup training/testing environment
    setup_env()
    # Construct the model, loss_fun, and optimizer
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model,
                                                      optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(
            cfg.TRAIN.WEIGHTS))
    # Create data loaders and meters
    if cfg.TEST.DATASET == 'imagenet_dataset' or cfg.TRAIN.DATASET == 'imagenet_dataset':
        dataset = loader.construct_train_loader()
        train_loader = dataset.train_loader
        test_loader = dataset.val_loader
    else:
        dataset = None
        train_loader = loader.construct_train_loader()
        test_loader = loader.construct_test_loader()
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    # Compute model and loader timings
    if start_epoch == 0 and cfg.PREC_TIME.NUM_ITER > 0:
        benchmark.compute_time_full(model, loss_fun, train_loader, test_loader)
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter,
                    cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if (cur_epoch + 1) % cfg.TRAIN.CHECKPOINT_PERIOD == 0:
            checkpoint_file = checkpoint.save_checkpoint(
                model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        next_epoch = cur_epoch + 1
        if next_epoch % cfg.TRAIN.EVAL_PERIOD == 0 or next_epoch == cfg.OPTIM.MAX_EPOCH:
            logger.info("Start testing")
            test_epoch(test_loader, model, test_meter, cur_epoch)
        if dataset is not None:
            logger.info("Reset the dataset")
            train_loader._dali_iterator.reset()
            test_loader._dali_iterator.reset()
            # clear memory
            if torch.cuda.is_available():
                torch.cuda.synchronize()
                torch.cuda.empty_cache()  # https://forums.fast.ai/t/clearing-gpu-memory-pytorch/14637
            gc.collect()
Example #14
def main(cfg):
    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    train_data = RotNetDataset(cfg.DATASET.NAME, train_data)
    train_size = len(train_data)

    print("\n Rotation Dataset {} Loaded Sucessfully.\nTotal Train Size: {}\n".
          format(cfg.DATASET.NAME, train_size))
    logger.info(
        "Rotation Dataset {} Loaded Sucessfully. Total Train Size: {}\n".
        format(cfg.DATASET.NAME, train_size))

    trainSet_path, valSet_path = data_obj.makeTVSets(val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,\
                                 seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.INIT_POOL.TRAINSET_PATH = trainSet_path
    cfg.INIT_POOL.VALSET_PATH = valSet_path

    trainSet, valSet = data_obj.loadTVPartitions(
        trainSetPath=cfg.INIT_POOL.TRAINSET_PATH,
        valSetPath=cfg.INIT_POOL.VALSET_PATH)

    print("Data Partitioning Complete. \nTrain Set: {}, Validation Set: {}\n".
          format(len(trainSet), len(valSet)))
    logger.info("Train Set: {}, Validation Set: {}\n".format(
        len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getSequentialDataLoader(
        indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

    # Initialize the model.
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== ROTATION TRAINING ========")
    logger.info("======== ROTATION TRAINING ========")

    best_val_acc, best_val_epoch, checkpoint_file = train_model(
        trainSet_loader, valSet_loader, model, optimizer, cfg)

    print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(
        round(best_val_acc, 4), best_val_epoch))
    logger.info("Best Validation Accuracy: {}\tBest Epoch: {}\n".format(
        round(best_val_acc, 4), best_val_epoch))

    # Test best model checkpoint
    print("======== ROTATION TESTING ========\n")
    logger.info("======== ROTATION TESTING ========\n")

    test_acc = test_model(trainSet_loader, checkpoint_file, cfg, cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")
Example #15
def main(cfg):

    # Login to wandb
    wandb.login()

    # Initialize a new wandb run
    wandb.init(project="rotation-pred", name=cfg.EXP_NAME)

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME,
                                   cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory
    # all logs, labeled, unlabeled, validation sets are stored here
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print(
            "Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n"
            .format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND SSL EVALUATION MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'),
                                        cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True,
                                                 isDownload=True)
    trainSet = [i for i in range(train_size)]

    print("\n Rotation Dataset {} Loaded Sucessfully.\nTotal Train Size: {}\n".
          format(cfg.DATASET.NAME, train_size))
    logger.info(
        "Rotation Dataset {} Loaded Sucessfully. Total Train Size: {}\n".
        format(cfg.DATASET.NAME, train_size))

    trainSet_path = data_obj.saveSet(setArray=trainSet,
                                     setName='trainSet',
                                     save_dir=cfg.EXP_DIR)
    trainSet = data_obj.loadPartition(setPath=trainSet_path)

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(
        indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

    # Initialize the evaluation model
    if cfg.MODEL.TYPE == 'linear':
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=None)
    else:
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=cfg.MODEL.NUM_HIDDEN)
    print("Evaluation model: {}\n".format(cfg.MODEL.EVAL))
    logger.info("Evalution model: {}\n".format(cfg.MODEL.EVAL))

    # Initialize the SSL model
    ssl_model = model_builder.build_model(cfg)
    ssl_checkpoint_file = os.path.join(os.path.abspath('..'),
                                       cfg.TEST.MODEL_PATH)
    ssl_model = cu.load_checkpoint(ssl_checkpoint_file, ssl_model)

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== EVALUATOR TRAINING ========")
    logger.info("======== EVALUATOR TRAINING ========")

    _, _, eval_checkpoint_file = train_model(trainSet_loader, None, model,
                                             ssl_model, optimizer, cfg)

    # eval_checkpoint_file = os.path.join(os.path.abspath('..'), '')

    # Test best model checkpoint
    print("======== EVALUATOR TESTING ========\n")
    logger.info("======== EVALUATOR TESTING ========\n")

    test_acc = test_model(trainSet_loader,
                          eval_checkpoint_file,
                          ssl_checkpoint_file,
                          cfg,
                          cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))

    print("================================\n\n")
    logger.info("================================\n\n")