def test_model(test_loader, eval_checkpoint_file, ssl_checkpoint_file, cfg, cur_episode):
    global plot_epoch_xvalues
    global plot_epoch_yvalues
    global plot_it_x_values
    global plot_it_y_values

    test_meter = TestMeter(len(test_loader))
    if cfg.MODEL.TYPE == 'linear':
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=None)
    else:
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=cfg.MODEL.NUM_HIDDEN)
    model = cu.load_checkpoint(eval_checkpoint_file, model)
    ssl_model = model_builder.build_model(cfg)
    ssl_model = cu.load_checkpoint(ssl_checkpoint_file, ssl_model)
    test_err = test_epoch(test_loader, model, ssl_model, test_meter, cur_episode)
    test_acc = 100. - test_err
    return test_acc

def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    # Log model complexity
    # logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    if cfg.TASK == "seg" and cfg.TRAIN.DATASET == "cityscapes":
        h, w = 1025, 2049
    else:
        h, w = cfg.TRAIN.IM_SIZE, cfg.TRAIN.IM_SIZE
    if cfg.TASK == "jig":
        x = torch.randn(1, cfg.JIGSAW_GRID ** 2, cfg.MODEL.INPUT_CHANNELS, h, w)
    else:
        x = torch.randn(1, cfg.MODEL.INPUT_CHANNELS, h, w)
    macs, params = profile(model, inputs=(x,), verbose=False)
    logger.info("Params: {:,}".format(params))
    logger.info("Flops: {:,}".format(macs))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # Use multi-process data parallel model in the multi-GPU setting
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        model = torch.nn.parallel.DistributedDataParallel(
            module=model, device_ids=[cur_device], output_device=cur_device
        )
        # Set complexity function to be module's complexity function
        # model.complexity = model.module.complexity
    return model

def test_model(test_loader, checkpoint_file, cfg, cur_episode):
    global plot_episode_xvalues
    global plot_episode_yvalues
    global plot_epoch_xvalues
    global plot_epoch_yvalues
    global plot_it_x_values
    global plot_it_y_values

    test_meter = TestMeter(len(test_loader))
    model = model_builder.build_model(cfg)
    model = cu.load_checkpoint(checkpoint_file, model)
    test_err = test_epoch(test_loader, model, test_meter, cur_episode)
    test_acc = 100. - test_err

    plot_episode_xvalues.append(cur_episode)
    plot_episode_yvalues.append(test_acc)
    plot_arrays(x_vals=plot_episode_xvalues, y_vals=plot_episode_yvalues,
                x_name="Episodes", y_name="Test Accuracy",
                dataset_name=cfg.DATASET.NAME, out_dir=cfg.EXP_DIR)
    save_plot_values([plot_episode_xvalues, plot_episode_yvalues],
                     ["plot_episode_xvalues", "plot_episode_yvalues"],
                     out_dir=cfg.EXP_DIR)
    return test_acc

def ensemble_test_model(test_loader, checkpoint_file, cfg, cur_episode):
    test_meter = TestMeter(len(test_loader))
    model = model_builder.build_model(cfg)
    model = cu.load_checkpoint(checkpoint_file, model)
    test_err = test_epoch(test_loader, model, test_meter, cur_episode)
    test_acc = 100. - test_err
    return test_acc

def main():
    config.load_cfg_fom_args("Train a classification model.")
    config.assert_and_infer_cfg()
    cfg.freeze()
    print("building model {}".format(cfg.MODEL.TYPE))
    model = build_model()
    model.eval()
    x = torch.randn(1, 3, 224, 224)
    y = model(x)
    print(y.shape)
    model_complex = complexity(model)
    print(model_complex)

def predict(args):
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 25
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 112
    cfg.REGNET.WA = 33.22
    cfg.REGNET.WM = 2.27
    cfg.REGNET.GROUP_W = 72
    cfg.BN.NUM_GROUPS = 4
    cfg.ANYNET.STEM_CHANNELS = 1
    cfg.MODEL.NUM_CLASSES = 10958

    net = builders.build_model()
    net.load_state_dict(torch.load(args.classify_model, map_location="cpu"))
    net.eval()
    softmax = nn.Softmax(dim=1)

    label_map = load_label_file()

    # Load audio file to np.array
    audio, sr = librosa.load(args.sound_file, mono=True, offset=1.1, sr=CFG.sample_rate)
    logmel = librosa.feature.melspectrogram(audio, sr, n_mels=CFG.n_mels, fmax=CFG.fmax)
    S_dB = librosa.power_to_db(logmel, ref=np.max)

    aug = augment.Augment(training=False)
    segs = S_dB.shape[1] // SEGMENT_SIZE
    for index in range(segs):
        begin = index * SEGMENT_SIZE
        end = begin + SEGMENT_SIZE
        if end > S_dB.shape[1]:
            print(f"{end} is out of range {S_dB.shape[1]} [{args.sound_file}]")
            continue
        sample = S_dB[:, begin:end].copy()
        sample = torch.from_numpy(sample)
        sample = sample.unsqueeze(0).unsqueeze(3)
        sample = aug(sample)
        sample = sample.permute(0, 3, 1, 2).float()
        result = net(sample)
        result = softmax(result)
        values, indices = torch.topk(result, 5)
        print("-----------------------------------------------")
        for ind, val in zip(indices[0], values[0]):
            ind = ind.item()
            # if ind > 0 and ind < 10950:
            print(ind, label_map[ind], f"({val.item()*100:.2f}%)")

def __init__(self, num_classes=1, ckpt=None):
    super(Regnet, self).__init__()
    from pycls.core.config import cfg
    import pycls.core.config as model_config
    from pycls.core.builders import build_model

    model_config.load_cfg_fom_args("Train a cls model")
    cfg.freeze()
    model = build_model()
    if ckpt:
        model.load_state_dict(torch.load(ckpt)['model_state'])
    # Swap the classification head for one matching num_classes
    in_features = model.head.fc.in_features
    fc = nn.Linear(in_features, num_classes)
    self.model = model
    self.model.head.fc = fc

def build_model(name, pretrained=False, cfg_list=()):
    """Constructs a predefined model (note: loads global config as well)."""
    # Load the config
    reset_cfg()
    config_file = get_config_file(name)
    cfg.merge_from_file(config_file)
    cfg.merge_from_list(cfg_list)
    # Construct model
    model = builders.build_model()
    # Load pretrained weights
    if pretrained:
        weights_file = get_weights_file(name)
        cp.load_checkpoint(weights_file, model)
    return model

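# A minimal usage sketch for the model-zoo helper above. The model name
# "RegNetX-400MF" and the cfg override are illustrative assumptions; actual
# zoo names depend on the config files shipped with the project.
model = build_model("RegNetX-400MF", pretrained=True,
                    cfg_list=("MODEL.NUM_CLASSES", "10"))
model.eval()
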
def test_model(test_loader, checkpoint_file, cfg, cur_episode):
    global plot_epoch_xvalues
    global plot_epoch_yvalues
    global plot_it_x_values
    global plot_it_y_values

    test_meter = TestMeter(len(test_loader))
    model = model_builder.build_model(cfg)
    model = cu.load_checkpoint(checkpoint_file, model)
    test_err = test_epoch(test_loader, model, test_meter, cur_episode)
    test_acc = 100. - test_err
    return test_acc

def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(model))
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # Use multi-process data parallel model in the multi-GPU setting
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(module=model, device_ids=[cur_device], output_device=cur_device)
    return model

def __init__(self, num_clusters, num_tiles, num_classes, ckpt):
    super().__init__()
    from pycls.core.config import cfg
    import pycls.core.config as model_config
    from pycls.core.builders import build_model

    model_config.load_cfg_fom_args("Train a cls model")
    cfg.freeze()
    model = build_model()
    if ckpt:
        model.load_state_dict(torch.load(ckpt)['model_state'])
    # Backbone encoder: stem + stages, pooled and flattened to a feature vector
    self.enc = nn.Sequential(model.stem, model.s1, model.s2, model.s3, model.s4,
                             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
                             nn.Flatten(),
                             nn.Dropout(p=0.3))
    self.nc = model.head.fc.in_features
    self.netvlad = NetVLAD(cluster_size=num_clusters, max_frames=num_tiles,
                           feature_size=self.nc, truncate=False)
    self.fc = nn.Linear(num_clusters * self.nc, num_classes)

def test_model():
    """Evaluates the model."""
    # Setup logging
    logging.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    # Build the model (before the loaders to speed up debugging)
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))
    # Compute precise time
    if cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        loss_fun = builders.build_loss_fun()
        prec_time = net.compute_precise_time(model, loss_fun)
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)
    # Load model weights
    checkpoint.load_checkpoint(cfg.TEST.WEIGHTS, model)
    logger.info("Loaded model weights from: {}".format(cfg.TEST.WEIGHTS))
    # Create data loaders
    test_loader = loader.construct_test_loader()
    # Create meters
    test_meter = meters.TestMeter(len(test_loader))
    # Evaluate the model
    test_epoch(test_loader, model, test_meter, 0)

def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    # Log model complexity
    logger.info(logging.dump_json_stats(net.complexity(model)))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # Use multi-process data parallel model in the multi-GPU setting
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        model = torch.nn.parallel.DistributedDataParallel(
            module=model, device_ids=[cur_device], output_device=cur_device,
            find_unused_parameters=True)
        # Set complexity function to be module's complexity function
        model.complexity = model.module.complexity
    return model

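# Hedged sketch of the distributed initialization that the DDP branch in
# setup_model() assumes has already happened (one process per GPU, process
# group up, current device set). The helper name and env:// defaults are
# illustrative, not part of this codebase.
import os
import torch
import torch.distributed as dist

def init_distributed(rank, world_size, master_addr="127.0.0.1", master_port="29500"):
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = master_port
    # NCCL backend for multi-GPU training; one process per device
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
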
def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    if cfg.VERBOSE:
        logger.info("Model:\n{}".format(model))
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current NPU device
    err_str = "Cannot use more NPU devices than available"
    # assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    assert cfg.NUM_GPUS <= torch.npu.device_count(), err_str
    cur_device = torch.npu.current_device()
    model = model.to(cur_device)
    optimizer = optim.construct_optimizer(model)
    # Mixed precision via apex (O2) with a static loss scale
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128)
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(model, device_ids=[cur_device], broadcast_buffers=False)
    return model, optimizer

def train_model():
    """Trains the model."""
    # Setup logging
    logging.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    # Build the model (before the loaders to speed up debugging)
    model = builders.build_model()
    logger.info("Model:\n{}".format(model))
    logger.info(logging.dump_json_stats(net.complexity(model)))
    # Define the loss function
    loss_fun = builders.build_loss_fun()
    # Construct the optimizer
    optimizer = optim.construct_optimizer(model)
    # Load checkpoint or initial weights
    start_epoch = 0
    if cfg.TRAIN.AUTO_RESUME and checkpoint.has_checkpoint():
        last_checkpoint = checkpoint.get_last_checkpoint()
        checkpoint_epoch = checkpoint.load_checkpoint(last_checkpoint, model, optimizer)
        logger.info("Loaded checkpoint from: {}".format(last_checkpoint))
        start_epoch = checkpoint_epoch + 1
    elif cfg.TRAIN.WEIGHTS:
        checkpoint.load_checkpoint(cfg.TRAIN.WEIGHTS, model)
        logger.info("Loaded initial weights from: {}".format(cfg.TRAIN.WEIGHTS))
    # Compute precise time
    if start_epoch == 0 and cfg.PREC_TIME.ENABLED:
        logger.info("Computing precise time...")
        prec_time = net.compute_precise_time(model, loss_fun)
        logger.info(logging.dump_json_stats(prec_time))
        net.reset_bn_stats(model)
    # Create data loaders
    train_loader = loader.construct_train_loader()
    test_loader = loader.construct_test_loader()
    # Create meters
    train_meter = meters.TrainMeter(len(train_loader))
    test_meter = meters.TestMeter(len(test_loader))
    # Perform the training loop
    logger.info("Start epoch: {}".format(start_epoch + 1))
    for cur_epoch in range(start_epoch, cfg.OPTIM.MAX_EPOCH):
        # Train for one epoch
        train_epoch(train_loader, model, loss_fun, optimizer, train_meter, cur_epoch)
        # Compute precise BN stats
        if cfg.BN.USE_PRECISE_STATS:
            net.compute_precise_bn_stats(model, train_loader)
        # Save a checkpoint
        if checkpoint.is_checkpoint_epoch(cur_epoch):
            checkpoint_file = checkpoint.save_checkpoint(model, optimizer, cur_epoch)
            logger.info("Wrote checkpoint to: {}".format(checkpoint_file))
        # Evaluate the model
        if is_eval_epoch(cur_epoch):
            test_epoch(test_loader, model, test_meter, cur_epoch)

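# Hedged sketch of the is_eval_epoch() helper used by the loop above; the
# cfg.TRAIN.EVAL_PERIOD knob and the exact rule are assumptions modeled on
# pycls conventions, not taken from this file.
def is_eval_epoch(cur_epoch):
    """Determines if the model should be evaluated at the current epoch."""
    last = (cur_epoch + 1) == cfg.OPTIM.MAX_EPOCH
    return (cur_epoch + 1) % cfg.TRAIN.EVAL_PERIOD == 0 or last
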
def main(cfg):
    # Login to wandb
    wandb.login()
    # Initialize a new wandb run
    wandb.init(project="rotation-pred", name=cfg.EXP_NAME)

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND SSL EVALUATION MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'), cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR,
                                                 isTrain=True, isDownload=True)
    trainSet = [i for i in range(train_size)]
    print("\nRotation Dataset {} Loaded Successfully.\nTotal Train Size: {}\n".format(cfg.DATASET.NAME, train_size))
    logger.info("Rotation Dataset {} Loaded Successfully. Total Train Size: {}\n".format(cfg.DATASET.NAME, train_size))

    trainSet_path = data_obj.saveSet(setArray=trainSet, setName='trainSet', save_dir=cfg.EXP_DIR)
    trainSet = data_obj.loadPartition(setPath=trainSet_path)

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(indexes=trainSet,
                                                       batch_size=cfg.TRAIN.BATCH_SIZE,
                                                       data=train_data)

    # Initialize the evaluation model
    if cfg.MODEL.TYPE == 'linear':
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=None)
    else:
        model = SSLEvaluator(n_input=cfg.MODEL.NUM_INPUT,
                             n_classes=cfg.MODEL.NUM_OUTPUT,
                             n_hidden=cfg.MODEL.NUM_HIDDEN)
    print("Evaluation model: {}\n".format(cfg.MODEL.EVAL))
    logger.info("Evaluation model: {}\n".format(cfg.MODEL.EVAL))

    # Initialize the SSL model
    ssl_model = model_builder.build_model(cfg)
    ssl_checkpoint_file = os.path.join(os.path.abspath('..'), cfg.TEST.MODEL_PATH)
    ssl_model = cu.load_checkpoint(ssl_checkpoint_file, ssl_model)

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== EVALUATOR TRAINING ========")
    logger.info("======== EVALUATOR TRAINING ========")
    _, _, eval_checkpoint_file = train_model(trainSet_loader, None, model, ssl_model, optimizer, cfg)
    # eval_checkpoint_file = os.path.join(os.path.abspath('..'), '')

    # Test best model checkpoint
    print("======== EVALUATOR TESTING ========\n")
    logger.info("======== EVALUATOR TESTING ========\n")
    test_acc = test_model(trainSet_loader, eval_checkpoint_file, ssl_checkpoint_file, cfg, cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))
    print("================================\n\n")
    logger.info("================================\n\n")

def train(args, train_loader, eval_loader):
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 25
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 112
    cfg.REGNET.WA = 33.22
    cfg.REGNET.WM = 2.27
    cfg.REGNET.GROUP_W = 72
    cfg.BN.NUM_GROUPS = 4
    cfg.ANYNET.STEM_CHANNELS = 1
    cfg.MODEL.NUM_CLASSES = config["num_classes"]
    net = builders.build_model()
    net = net.cuda(device=torch.cuda.current_device())
    print("net", net)

    if args.resume:
        print("Resuming training, loading {}...".format(args.resume))
        ckpt_file = (config["save_folder"] + config["ckpt_name"] + "_" + str(args.resume) + ".pth")
        net.load_state_dict(torch.load(ckpt_file))

    if args.finetune:
        print("Finetuning......")
        # Freeze all layers
        for param in net.parameters():
            param.requires_grad = False
        # Unfreeze some layers
        for layer in [net.s4.b1, net.s4.b2]:
            for param in layer.parameters():
                param.requires_grad = True  # fixed typo: was "requies_grad", which silently left the layers frozen
        net.head.fc.weight.requires_grad = True
        optimizer = optim.SGD(
            filter(lambda param: param.requires_grad, net.parameters()),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )
    else:
        optimizer = optim.SGD(
            net.parameters(),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )

    scheduler = ReduceLROnPlateau(
        optimizer,
        "max",
        factor=0.5,
        patience=1,
        verbose=True,
        threshold=1e-3,
        threshold_mode="abs",
    )
    aug = augment.Augment().cuda()

    if args.fp16:
        import apex.amp as amp
        net, optimizer = amp.initialize(net, optimizer, opt_level="O2")

    batch_iterator = iter(train_loader)
    sum_accuracy = 0
    step = 0
    config["eval_period"] = len(train_loader.dataset) // args.batch_size
    config["verbose_period"] = config["eval_period"] // 5
    train_start_time = time.time()
    for iteration in range(
        args.resume + 1,
        args.max_epoch * len(train_loader.dataset) // args.batch_size,
    ):
        t0 = time.time()
        try:
            sounds, type_ids = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(train_loader)
            sounds, type_ids = next(batch_iterator)
        except Exception as ex:
            print("Loading data exception:", ex)

        if torch.cuda.is_available():
            sounds = Variable(sounds.cuda())
            type_ids = Variable(type_ids.cuda())
        else:
            sounds = Variable(sounds)
            type_ids = Variable(type_ids)
        sounds = sounds.unsqueeze(3)
        sounds = sounds.permute(0, 3, 1, 2).float()

        # Label-smoothed targets: 0.5 on the true class, the rest spread uniformly
        if torch.cuda.is_available():
            one_hot = torch.cuda.FloatTensor(type_ids.shape[0], config["num_classes"])
        else:
            one_hot = torch.FloatTensor(type_ids.shape[0], config["num_classes"])
        one_hot.fill_(0.5 / (config["num_classes"] - 1))
        one_hot.scatter_(1, type_ids.unsqueeze(1), 0.5)

        # augmentation
        sounds = aug(sounds)

        # forward
        out = net(sounds)

        # backprop
        optimizer.zero_grad()
        loss = torch.sum(-one_hot * F.log_softmax(out, -1), -1).mean()
        # loss = F.cross_entropy(out, type_ids)
        if args.fp16:
            import apex.amp as amp
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), max_norm=20, norm_type=2)
        optimizer.step()
        t1 = time.time()

        if iteration % config["verbose_period"] == 0:
            # accuracy
            _, predict = torch.max(out, 1)
            correct = (predict == type_ids)
            accuracy = correct.sum().item() / correct.size()[0]
            print(
                "iter: %d loss: %.4f | acc: %.4f | time: %.4f sec."
                % (iteration, loss.item(), accuracy, (t1 - t0)),
                flush=True,
            )
            sum_accuracy += accuracy
            step += 1

        warmup_steps = config["verbose_period"]
        if iteration < warmup_steps:
            warmup_learning_rate(optimizer, iteration, warmup_steps)

        if iteration % config["eval_period"] == 0 and iteration != 0 and step != 0:
            with torch.no_grad():
                loss, accuracy = evaluate(net, eval_loader)
            hours = int(time.time() - train_start_time) // 3600
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
            print(
                "[{}] [{}] Eval accuracy:{:6f} | Train accuracy:{:6f}".format(
                    now, hours, accuracy, sum_accuracy / step
                ),
                flush=True,
            )
            scheduler.step(accuracy)
            sum_accuracy = 0
            step = 0

        if iteration % config["eval_period"] == 0 and iteration != 0:
            # save checkpoint
            print("Saving state, iter:", iteration, flush=True)
            save_ckpt(net, iteration)

    # final checkpoint
    save_ckpt(net, iteration)

def main(cfg):
    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME)
    if not os.path.exists(dataset_out_dir):
        os.mkdir(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'), cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)
    print("\nDataset {} Loaded Successfully.\nTotal Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))
    logger.info("Dataset {} Loaded Successfully. Total Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))

    trainSet_path, valSet_path = data_obj.makeTVSets(
        train_split_ratio=cfg.ACTIVE_LEARNING.INIT_RATIO,
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,
        seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)
    trainSet, valSet = data_obj.loadTVPartitions(trainSetPath=trainSet_path, valSetPath=valSet_path)
    print("Data Partitioning Complete.\nTrain Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))
    logger.info("\nTrain Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getIndexesDataLoader(indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data, test_batch_size=cfg.TRAIN.BATCH_SIZE, seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))
    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train models
    print("======== ENSEMBLE TRAINING ========")
    logger.info("======== ENSEMBLE TRAINING ========")
    best_model_paths = []
    test_accs = []
    for i in range(num_ensembles):
        print("=== Training ensemble [{}/{}] ===".format(i + 1, num_ensembles))
        # Construct the optimizer
        optimizer = optim.construct_optimizer(cfg, models[i])
        print("optimizer: {}\n".format(optimizer))
        logger.info("optimizer: {}\n".format(optimizer))
        # Each ensemble gets its own output directory
        # (fixed a stray trailing space in the directory name)
        cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR, 'model_{}'.format(i + 1))
        # Train the model
        best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(trainSet_loader, valSet_loader, models[i], optimizer, cfg)
        best_model_paths.append(checkpoint_file)
        print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".format(i + 1, round(best_val_acc, 4), best_val_epoch))
        logger.info("Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n".format(i + 1, round(best_val_acc, 4), best_val_epoch))
        # Test the model
        print("=== Testing ensemble [{}/{}] ===".format(i + 1, num_ensembles))
        test_acc = ensemble_test_model(test_loader, checkpoint_file, cfg, cur_episode=0)
        test_accs.append(test_acc)
        print("Test Accuracy by Model {}: {}.\n".format(i + 1, round(test_acc, 4)))
        logger.info("Test Accuracy by Model {}: {}.\n".format(i + 1, test_acc))
        # Reset EPISODE_DIR
        cfg.EPISODE_DIR = cfg.EXP_DIR

    # Test each best model checkpoint and report the average
    print("======== ENSEMBLE TESTING ========\n")
    logger.info("======== ENSEMBLE TESTING ========\n")
    mean_test_acc = np.mean(test_accs)
    print("Average Ensemble Test Accuracy: {}.\n".format(round(mean_test_acc, 4)))
    logger.info("Average Ensemble Test Accuracy: {}.\n".format(mean_test_acc))
    print("================================\n\n")
    logger.info("================================\n\n")

def main(cfg):
    global plot_episode_xvalues
    global plot_episode_yvalues
    global plot_epoch_xvalues
    global plot_epoch_yvalues
    global plot_it_x_values
    global plot_it_y_values

    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'), cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)
    print("\nDataset {} Loaded Successfully.\nTotal Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))
    logger.info("Dataset {} Loaded Successfully. Total Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))

    print("\nSampling Initial Pool using {}.".format(str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    logger.info("\nSampling Initial Pool using {}.".format(str.upper(cfg.INIT_POOL.SAMPLING_FN)))
    if cfg.INIT_POOL.SAMPLING_FN == 'random':
        lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(
            train_split_ratio=cfg.INIT_POOL.INIT_RATIO,
            val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,
            seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)
    else:
        lSet, uSet = InitialPool(cfg).sample_from_uSet(train_data)
        lSet_path = f'{cfg.EXP_DIR}/lSet.npy'
        np.save(lSet_path, lSet)
        np.save(f'{cfg.EXP_DIR}/lSet_initial.npy', lSet)
        uSet_path, valSet_path = data_obj.makeUVSets(
            val_split_ratio=cfg.DATASET.VAL_RATIO, data=uSet,
            seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(
        lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH,
        uSetPath=cfg.ACTIVE_LEARNING.USET_PATH,
        valSetPath=cfg.ACTIVE_LEARNING.VALSET_PATH)
    print("Data Partitioning Complete.\nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(len(lSet), len(uSet), len(valSet)))
    logger.info("Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    uSet_loader = data_obj.getIndexesDataLoader(indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data, test_batch_size=cfg.TRAIN.BATCH_SIZE, seed_id=cfg.RNG_SEED)

    # Initialize the models
    num_ensembles = cfg.ENSEMBLE.NUM_MODELS
    models = []
    for i in range(num_ensembles):
        models.append(model_builder.build_model(cfg))
    print("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))
    logger.info("{} ensemble models of type: {}\n".format(cfg.ENSEMBLE.NUM_MODELS, cfg.ENSEMBLE.MODEL_TYPE))

    print("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("Max AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.MAX_ITER))

    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):
        wandb.log({"Episode": cur_episode})
        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info("======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train models
        print("======== ENSEMBLE TRAINING ========")
        logger.info("======== ENSEMBLE TRAINING ========")
        best_model_paths = []
        test_accs = []
        for i in range(num_ensembles):
            print("=== Training ensemble [{}/{}] ===".format(i + 1, num_ensembles))
            # Construct the optimizer
            optimizer = optim.construct_optimizer(cfg, models[i])
            print("optimizer: {}\n".format(optimizer))
            logger.info("optimizer: {}\n".format(optimizer))
            # Each ensemble gets its own output directory
            cfg.EPISODE_DIR = os.path.join(cfg.EPISODE_DIR, 'model_{}'.format(i + 1))
            # Train the model
            best_val_acc, best_val_epoch, checkpoint_file = ensemble_train_model(lSet_loader, valSet_loader, models[i], optimizer, cfg)
            best_model_paths.append(checkpoint_file)
            print("Best Validation Accuracy by Model {}: {}\nBest Epoch: {}\n".format(i + 1, round(best_val_acc, 4), best_val_epoch))
            logger.info("EPISODE {} Best Validation Accuracy by Model {}: {}\tBest Epoch: {}\n".format(cur_episode, i + 1, round(best_val_acc, 4), best_val_epoch))
            # Test the model
            print("=== Testing ensemble [{}/{}] ===".format(i + 1, num_ensembles))
            test_acc = ensemble_test_model(test_loader, checkpoint_file, cfg, cur_episode)
            test_accs.append(test_acc)
            print("Test Accuracy by Model {}: {}.\n".format(i + 1, round(test_acc, 4)))
            logger.info("EPISODE {} Test Accuracy by Model {}: {}.\n".format(cur_episode, i + 1, test_acc))
            # Reset EPISODE_DIR
            cfg.EPISODE_DIR = episode_dir

        # Test each best model checkpoint and report the average
        print("======== ENSEMBLE TESTING ========\n")
        logger.info("======== ENSEMBLE TESTING ========\n")
        mean_test_acc = np.mean(test_accs)
        print("Average Ensemble Test Accuracy: {}.\n".format(round(mean_test_acc, 4)))
        logger.info("EPISODE {} Average Ensemble Test Accuracy: {}.\n".format(cur_episode, mean_test_acc))
        wandb.log({"Test Accuracy": mean_test_acc})

        plot_episode_xvalues.append(cur_episode)
        plot_episode_yvalues.append(mean_test_acc)
        plot_arrays(x_vals=plot_episode_xvalues, y_vals=plot_episode_yvalues,
                    x_name="Episodes", y_name="Test Accuracy",
                    dataset_name=cfg.DATASET.NAME, out_dir=cfg.EXP_DIR)
        save_plot_values([plot_episode_xvalues, plot_episode_yvalues],
                         ["plot_episode_xvalues", "plot_episode_yvalues"],
                         out_dir=cfg.EXP_DIR, saveInTextFormat=True)

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            break

        # Active Sample
        print("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        logger.info("======== ENSEMBLE ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_models = []
        for i in range(num_ensembles):
            temp = model_builder.build_model(cfg)
            clf_models.append(cu.load_checkpoint(best_model_paths[i], temp))
        activeSet, new_uSet = al_obj.sample_from_uSet(None, lSet, uSet, train_data, supportingModels=clf_models)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print("Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n".format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info("Ensemble Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n".format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")

def train(args, train_loader, eval_loader):
    cfg.MODEL.TYPE = "regnet"
    cfg.REGNET.DEPTH = 20
    cfg.REGNET.SE_ON = False
    cfg.REGNET.W0 = 512
    cfg.MODEL.NUM_CLASSES = config["num_classes"]
    net = model_builder.build_model()
    net = net.cuda(device=torch.cuda.current_device())
    print("net", net)

    if args.resume:
        print("Resuming training, loading {}...".format(args.resume))
        ckpt_file = (config["save_folder"] + config["ckpt_name"] + "_" + str(args.resume) + ".pth")
        net.load_state_dict(torch.load(ckpt_file))

    if args.finetune:
        print("Finetuning......")
        # Freeze all layers
        for param in net.parameters():
            param.requires_grad = False
        # Unfreeze some layers
        for layer in [net.s1.b18, net.s1.b19, net.s1.b20]:
            for param in layer.parameters():
                param.requires_grad = True  # fixed typo: was "requies_grad", which silently left the layers frozen
        net.head.fc.weight.requires_grad = True
        optimizer = optim.SGD(
            filter(lambda param: param.requires_grad, net.parameters()),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )
    else:
        optimizer = optim.SGD(
            net.parameters(),
            lr=args.lr,
            momentum=args.momentum,
            nesterov=False,
        )

    scheduler = ReduceLROnPlateau(
        optimizer,
        "max",
        factor=0.5,
        patience=2,
        verbose=True,
        threshold=1e-3,
        threshold_mode="abs",
    )
    if args.fp16:
        net, optimizer = amp.initialize(net, optimizer, opt_level="O2")
    aug = augmentations.Augmentations().cuda()

    batch_iterator = iter(train_loader)
    sum_accuracy = 0
    step = 0
    for iteration in range(
        args.resume + 1,
        args.max_epoch * len(train_loader.dataset) // args.batch_size,
    ):
        t0 = time.time()
        try:
            images, type_ids = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(train_loader)
            images, type_ids = next(batch_iterator)
        except Exception as e:
            print("Loading data exception:", e)

        images = Variable(images.cuda()).permute(0, 3, 1, 2).float()
        type_ids = Variable(type_ids.cuda())
        one_hot = torch.cuda.FloatTensor(type_ids.shape[0], config["num_classes"])
        one_hot.fill_((1 - 0.5) / config["num_classes"])
        one_hot.scatter_(1, type_ids.unsqueeze(1), 0.5)

        # augmentation
        if not args.finetune:
            images = aug(images)

        # forward
        out = net(images)
        loss = (torch.sum(-one_hot * F.log_softmax(out, -1), -1).mean() / args.iter_size)
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), max_norm=20, norm_type=2)

        # Gradient accumulation: step only every args.iter_size iterations
        if iteration != 0 and iteration % args.iter_size == 0:
            # backprop
            optimizer.step()
            optimizer.zero_grad()
        t1 = time.time()

        if iteration % config["verbose_period"] == 0:
            # accuracy
            _, predict = torch.max(out, 1)
            correct = predict == type_ids
            accuracy = correct.sum().item() / correct.size()[0]
            print(
                "iter: %d loss: %.4f | acc: %.4f | time: %.4f sec."
                % (iteration, loss.item(), accuracy, (t1 - t0)),
                flush=True,
            )
            sum_accuracy += accuracy
            step += 1

        warmup_steps = config["verbose_period"] * 8 * args.iter_size
        if iteration < warmup_steps:
            warmup_learning_rate(optimizer, iteration, warmup_steps)

        if iteration % config["eval_period"] == 0 and iteration != 0 and step != 0:
            loss, accuracy = evaluate(net, eval_loader)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(
                f"[{now}] Eval accuracy: {accuracy:.4f} | Train accuracy: {sum_accuracy/step:.4f}",
                flush=True,
            )
            scheduler.step(accuracy)
            sum_accuracy = 0
            step = 0

        if iteration % config["save_period"] == 0 and iteration != 0:
            # save checkpoint
            print("Saving state, iter:", iteration, flush=True)
            save_ckpt(net, iteration)

    # final checkpoint
    save_ckpt(net, iteration)

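# Worked sketch of the label-smoothed targets built in the two train()
# variants above. The 0.5 / (N - 1) fill (earlier variant) makes each row sum
# to exactly 1: (N - 1) * 0.5 / (N - 1) + 0.5 = 1. The (1 - 0.5) / N fill in
# the variant directly above instead sums to 0.5 + 0.5 * (N - 1) / N =
# 1 - 0.5 / N, slightly under 1. Standalone illustration, not project code.
import torch

num_classes = 4
type_ids = torch.tensor([2, 0])  # ground-truth class per sample
one_hot = torch.full((type_ids.shape[0], num_classes), 0.5 / (num_classes - 1))
one_hot.scatter_(1, type_ids.unsqueeze(1), 0.5)  # put 0.5 on the true class
print(one_hot.sum(dim=1))  # tensor([1., 1.])
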
def main(cfg):
    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Auto assign a RNG_SEED when not supplied a value
    if cfg.RNG_SEED is None:
        cfg.RNG_SEED = np.random.randint(100)

    # Using specific GPU
    # os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.mkdir(cfg.OUT_DIR)
    # Create "DATASET/MODEL TYPE" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Save the config file in EXP_DIR
    dump_cfg(cfg)

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'), cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    test_data, test_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=False, isDownload=True)
    print("\nDataset {} Loaded Successfully.\nTotal Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))
    logger.info("Dataset {} Loaded Successfully. Total Train Size: {} and Total Test Size: {}\n".format(cfg.DATASET.NAME, train_size, test_size))

    lSet_path, uSet_path, valSet_path = data_obj.makeLUVSets(
        train_split_ratio=cfg.ACTIVE_LEARNING.INIT_L_RATIO,
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,
        seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)

    cfg.ACTIVE_LEARNING.LSET_PATH = lSet_path
    cfg.ACTIVE_LEARNING.USET_PATH = uSet_path
    cfg.ACTIVE_LEARNING.VALSET_PATH = valSet_path

    lSet, uSet, valSet = data_obj.loadPartitions(
        lSetPath=cfg.ACTIVE_LEARNING.LSET_PATH,
        uSetPath=cfg.ACTIVE_LEARNING.USET_PATH,
        valSetPath=cfg.ACTIVE_LEARNING.VALSET_PATH)
    print("Data Partitioning Complete.\nLabeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(len(lSet), len(uSet), len(valSet)))
    logger.info("Labeled Set: {}, Unlabeled Set: {}, Validation Set: {}\n".format(len(lSet), len(uSet), len(valSet)))

    # Preparing dataloaders for initial training
    lSet_loader = data_obj.getIndexesDataLoader(indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getIndexesDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    test_loader = data_obj.getTestLoader(data=test_data, test_batch_size=cfg.TRAIN.BATCH_SIZE, seed_id=cfg.RNG_SEED)

    # Initialize the model
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    print("AL Query Method: {}\nMax AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))
    logger.info("AL Query Method: {}\nMax AL Episodes: {}\n".format(cfg.ACTIVE_LEARNING.SAMPLING_FN, cfg.ACTIVE_LEARNING.MAX_ITER))

    for cur_episode in range(0, cfg.ACTIVE_LEARNING.MAX_ITER + 1):
        print("======== EPISODE {} BEGINS ========\n".format(cur_episode))
        logger.info("======== EPISODE {} BEGINS ========\n".format(cur_episode))

        # Creating output directory for the episode
        episode_dir = os.path.join(cfg.EXP_DIR, f'episode_{cur_episode}')
        if not os.path.exists(episode_dir):
            os.mkdir(episode_dir)
        cfg.EPISODE_DIR = episode_dir

        # Train model
        print("======== TRAINING ========")
        logger.info("======== TRAINING ========")
        best_val_acc, best_val_epoch, checkpoint_file = train_model(lSet_loader, valSet_loader, model, optimizer, cfg)
        print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(round(best_val_acc, 4), best_val_epoch))
        logger.info("EPISODE {} Best Validation Accuracy: {}\tBest Epoch: {}\n".format(cur_episode, round(best_val_acc, 4), best_val_epoch))

        # Test best model checkpoint
        print("======== TESTING ========\n")
        logger.info("======== TESTING ========\n")
        test_acc = test_model(test_loader, checkpoint_file, cfg, cur_episode)
        print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
        logger.info("EPISODE {} Test Accuracy {}.\n".format(cur_episode, test_acc))

        # No need to perform active sampling in the last episode iteration
        if cur_episode == cfg.ACTIVE_LEARNING.MAX_ITER:
            # Save current lSet, uSet in the final episode directory
            data_obj.saveSet(lSet, 'lSet', cfg.EPISODE_DIR)
            data_obj.saveSet(uSet, 'uSet', cfg.EPISODE_DIR)
            break

        # Active Sample
        print("======== ACTIVE SAMPLING ========\n")
        logger.info("======== ACTIVE SAMPLING ========\n")
        al_obj = ActiveLearning(data_obj, cfg)
        clf_model = model_builder.build_model(cfg)
        clf_model = cu.load_checkpoint(checkpoint_file, clf_model)
        activeSet, new_uSet = al_obj.sample_from_uSet(clf_model, lSet, uSet, train_data)

        # Save current lSet, new_uSet and activeSet in the episode directory
        data_obj.saveSets(lSet, uSet, activeSet, cfg.EPISODE_DIR)

        # Add activeSet to lSet, save new_uSet as uSet and update dataloader for the next episode
        lSet = np.append(lSet, activeSet)
        uSet = new_uSet

        lSet_loader = data_obj.getIndexesDataLoader(indexes=lSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        valSet_loader = data_obj.getIndexesDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
        uSet_loader = data_obj.getSequentialDataLoader(indexes=uSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

        print("Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n".format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        logger.info("Active Sampling Complete. After Episode {}:\nNew Labeled Set: {}, New Unlabeled Set: {}, Active Set: {}\n".format(cur_episode, len(lSet), len(uSet), len(activeSet)))
        print("================================\n\n")
        logger.info("================================\n\n")

def main(cfg):
    # Setting up GPU args
    use_cuda = (cfg.NUM_GPUS > 0) and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {
        'num_workers': cfg.DATA_LOADER.NUM_WORKERS,
        'pin_memory': cfg.DATA_LOADER.PIN_MEMORY
    } if use_cuda else {}

    # Using specific GPU
    os.environ['NVIDIA_VISIBLE_DEVICES'] = str(cfg.GPU_ID)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    print("Using GPU : {}.\n".format(cfg.GPU_ID))

    # Getting the output directory ready (default is "/output")
    cfg.OUT_DIR = os.path.join(os.path.abspath('..'), cfg.OUT_DIR)
    if not os.path.exists(cfg.OUT_DIR):
        os.makedirs(cfg.OUT_DIR)
    # Create "DATASET" specific directory
    dataset_out_dir = os.path.join(cfg.OUT_DIR, cfg.DATASET.NAME, cfg.MODEL.TYPE)
    if not os.path.exists(dataset_out_dir):
        os.makedirs(dataset_out_dir)
    # Creating the experiment directory inside the dataset specific directory;
    # all logs, labeled, unlabeled, validation sets are stored here.
    # E.g., output/CIFAR10/resnet18/{timestamp or cfg.EXP_NAME based on arguments passed}
    if cfg.EXP_NAME == 'auto':
        now = datetime.now()
        exp_dir = f'{now.year}_{now.month}_{now.day}_{now.hour}{now.minute}{now.second}'
    else:
        exp_dir = cfg.EXP_NAME

    exp_dir = os.path.join(dataset_out_dir, exp_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
        print("Experiment Directory is {}.\n".format(exp_dir))
    else:
        print("Experiment Directory Already Exists: {}. Reusing it may lead to loss of old logs in the directory.\n".format(exp_dir))
    cfg.EXP_DIR = exp_dir

    # Setup Logger
    lu.setup_logging(cfg)

    # Dataset preparing steps
    print("\n======== PREPARING DATA AND MODEL ========\n")
    cfg.DATASET.ROOT_DIR = os.path.join(os.path.abspath('..'), cfg.DATASET.ROOT_DIR)
    data_obj = Data(cfg)
    train_data, train_size = data_obj.getDataset(save_dir=cfg.DATASET.ROOT_DIR, isTrain=True, isDownload=True)
    train_data = RotNetDataset(cfg.DATASET.NAME, train_data)
    train_size = len(train_data)
    print("\nRotation Dataset {} Loaded Successfully.\nTotal Train Size: {}\n".format(cfg.DATASET.NAME, train_size))
    logger.info("Rotation Dataset {} Loaded Successfully. Total Train Size: {}\n".format(cfg.DATASET.NAME, train_size))

    trainSet_path, valSet_path = data_obj.makeTVSets(
        val_split_ratio=cfg.DATASET.VAL_RATIO, data=train_data,
        seed_id=cfg.RNG_SEED, save_dir=cfg.EXP_DIR)
    cfg.INIT_POOL.TRAINSET_PATH = trainSet_path
    cfg.INIT_POOL.VALSET_PATH = valSet_path
    trainSet, valSet = data_obj.loadTVPartitions(
        trainSetPath=cfg.INIT_POOL.TRAINSET_PATH,
        valSetPath=cfg.INIT_POOL.VALSET_PATH)
    print("Data Partitioning Complete.\nTrain Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))
    logger.info("Train Set: {}, Validation Set: {}\n".format(len(trainSet), len(valSet)))

    # Preparing dataloaders for initial training
    trainSet_loader = data_obj.getSequentialDataLoader(indexes=trainSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)
    valSet_loader = data_obj.getSequentialDataLoader(indexes=valSet, batch_size=cfg.TRAIN.BATCH_SIZE, data=train_data)

    # Initialize the model
    model = model_builder.build_model(cfg)
    print("model: {}\n".format(cfg.MODEL.TYPE))
    logger.info("model: {}\n".format(cfg.MODEL.TYPE))

    # Construct the optimizer
    optimizer = optim.construct_optimizer(cfg, model)
    print("optimizer: {}\n".format(optimizer))
    logger.info("optimizer: {}\n".format(optimizer))

    # This is to seamlessly use the code originally written for AL episodes
    cfg.EPISODE_DIR = cfg.EXP_DIR

    # Train model
    print("======== ROTATION TRAINING ========")
    logger.info("======== ROTATION TRAINING ========")
    best_val_acc, best_val_epoch, checkpoint_file = train_model(trainSet_loader, valSet_loader, model, optimizer, cfg)
    print("Best Validation Accuracy: {}\nBest Epoch: {}\n".format(round(best_val_acc, 4), best_val_epoch))
    logger.info("Best Validation Accuracy: {}\tBest Epoch: {}\n".format(round(best_val_acc, 4), best_val_epoch))

    # Test best model checkpoint
    print("======== ROTATION TESTING ========\n")
    logger.info("======== ROTATION TESTING ========\n")
    test_acc = test_model(trainSet_loader, checkpoint_file, cfg, cur_episode=1)
    print("Test Accuracy: {}.\n".format(round(test_acc, 4)))
    logger.info("Test Accuracy {}.\n".format(test_acc))
    print("================================\n\n")
    logger.info("================================\n\n")