def train(name, model, training_data, validation_data, crit, optimizer, scheduler, opt):
    valid_aucs = [0.]
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_auc = train_epoch(model, training_data, crit, optimizer)
        print(' - (Training) loss: {loss: 8.5f}, auc: {auc:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  loss=train_loss, auc=100*train_auc, elapse=(time.time()-start)/60))

        start = time.time()
        valid_loss, valid_auc, valid_proba = eval_epoch(model, validation_data, crit)
        print(' - (Validation) loss: {loss: 8.5f}, auc: {auc:3.3f} %, '
              'elapse: {elapse:3.3f} min'.format(
                  loss=valid_loss, auc=100*valid_auc, elapse=(time.time()-start)/60))

        best_auc = max(valid_aucs)
        valid_aucs += [valid_auc]
        scheduler.step(valid_loss)

        model_state_dict = model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'settings': opt,
            'epoch': epoch_i,
            'auc': valid_auc}
        model_name = name + '.chkpt'

        if valid_auc >= best_auc:
            print('new best auc:', valid_auc)
            best_proba = valid_proba
            best_model = model
            if opt.save_model:
                torch.save(checkpoint, 'models/' + model_name)
                print(' - [Info] The checkpoint file has been updated.')

        if opt.log:
            directory = 'predictions/' + opt.name
            log_train_file = directory + '/train.log'
            log_valid_file = directory + '/valid.log'
            with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
                log_tf.write('{fold},{epoch},{loss: 8.5f},{auc:3.3f}\n'.format(
                    fold=name, epoch=epoch_i, loss=train_loss, auc=100*train_auc))
                log_vf.write('{fold},{epoch},{loss: 8.5f},{auc:3.3f}\n'.format(
                    fold=name, epoch=epoch_i, loss=valid_loss, auc=100*valid_auc))

    return best_model, best_proba
def swa_train(model, swa_model, train_iter, valid_iter, optimizer, criterion, pretrain_epochs, swa_epochs, swa_lr, cycle_length, device, writer, cpt_filename): swa_n = 1 swa_model.load_state_dict(copy.deepcopy(model.state_dict())) utils.save_checkpoint( cpt_directory, 1, '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length, cpt_filename), state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict(), swa_n=swa_n, optimizer=optimizer.state_dict() ) for e in range(swa_epochs): epoch = e + pretrain_epochs time_ep = time.time() lr = utils.schedule(epoch, cycle_length, lr_init, swa_lr) utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(model, train_iter, optimizer, criterion, device) valid_res = utils.evaluate(model, valid_iter, criterion, device) utils.moving_average(swa_model, model, swa_n) swa_n += 1 utils.bn_update(train_iter, swa_model) swa_res = utils.evaluate(swa_model, valid_iter, criterion, device) time_ep = time.time() - time_ep values = [epoch + 1, lr, swa_lr, cycle_length, train_res['loss'], valid_res['loss'], swa_res['loss'], None, None, time_ep] writer.writerow(values) table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if epoch % 20 == 0: table = table.split('\n') table = '\n'.join([table[1]] + table) else: table = table.split('\n')[2] print(table) utils.save_checkpoint( cpt_directory, epoch + 1, '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length, cpt_filename), state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict(), swa_n=swa_n, optimizer=optimizer.state_dict() )
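# Hedged sketch, not part of the original script: utils.moving_average(swa_model, model, swa_n)
# above is assumed to implement the standard SWA running average of weights. The helper below is
# an illustrative guess at that behaviour; the repository's own utils may differ.
import torch

def moving_average_sketch(swa_model, model, n_averaged):
    """Fold the current weights into the running SWA average over n_averaged models."""
    alpha = 1.0 / (n_averaged + 1)
    with torch.no_grad():
        for swa_param, param in zip(swa_model.parameters(), model.parameters()):
            swa_param.mul_(1.0 - alpha).add_(param, alpha=alpha)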
def train(model, train_iter, valid_iter, optimizer, criterion, pretrain_epochs, device, writer, cpt_filename): print(f'The model has {utils.count_parameters(model):,} trainable parameters') lr = lr_init utils.save_checkpoint( cpt_directory, 1, date + "-" + cpt_filename, state_dict=model.state_dict(), optimizer=optimizer.state_dict() ) for epoch in range(pretrain_epochs): time_ep = time.time() swa = epoch > pretrain_epochs train_res = utils.train_epoch(model, train_iter, optimizer, criterion, device) valid_res = utils.evaluate(model, valid_iter, criterion, device) time_ep = time.time() - time_ep values = [epoch + 1, lr, None, None, train_res['loss'], valid_res['loss'], None, None, None, time_ep] writer.writerow(values) table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if epoch % 20 == 0: table = table.split('\n') table = '\n'.join([table[1]] + table) else: table = table.split('\n')[2] print(table) if (epoch + 1) % save_freq == 0 or swa: utils.save_checkpoint( cpt_directory, epoch + 1, date + "final-" + cpt_filename, state_dict=model.state_dict(), optimizer=optimizer.state_dict() )
train_ds = PAN2020(train_dir, tokenizer, seq_len, 3)
test_ds = PAN2020(test_dir, tokenizer, seq_len, 3)

train_dataloader = DataLoader(
    train_ds,
    sampler=RandomSampler(train_ds),
    batch_size=batch_size
)
# train_dataloader = DataLoader(
#     test_ds,
#     sampler=RandomSampler(test_ds),
#     batch_size=batch_size
# )
test_dataloader = DataLoader(
    test_ds,
    sampler=RandomSampler(test_ds),
    batch_size=batch_size
)

optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

model.train()
epochs_it = trange(epochs, desc='Epoch', mininterval=0)
global_step = 0
for e in epochs_it:
    global_step = train_epoch(model, optimizer, train_dataloader, test_dataloader,
                              device, tb_writer, global_step=global_step)
def main(): args = parser() torch.backends.cudnn.benchmark = True torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) ###### # prepare model and dataset ###### print("Using model %s" % args.model) model_cfg = getattr(models, args.model) loaders, num_classes, num_data = generate_data(args, model_cfg) train_loader = loaders["train"] test_loader = loaders["test"] print("Preparing model") model = model_cfg.base(*model_cfg.args, num_classes=num_classes, **model_cfg.kwargs) model.cuda() ## please note that this code will only work if the checkpoints are saved as cuda tensors if args.resume is not None: print("Loading Model") checkpoint = torch.load(args.resume) start_epoch = checkpoint["epoch"] model.load_state_dict(checkpoint["state_dict"]) model.eval() pars = [] param_name_list = list(model.named_parameters()) for i, (n, p) in enumerate(param_name_list): if i < len(param_name_list) - 2: p.requires_grad = False else: pars.append(p) optimizer = torch.optim.Adam(pars, lr=args.lr_init, amsgrad=True) criterion = cross_entropy if args.epochs == 0: eval_dict = eval(loader=test_loader, model=model, criterion=criterion, verbose=True) print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"], eval_dict["accuracy"])) for epoch in range(args.epochs): train_epoch( loader=train_loader, model=model, criterion=criterion, optimizer=optimizer, verbose=True, ) eval_dict = eval(loader=test_loader, model=model, criterion=criterion, verbose=True) print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"], eval_dict["accuracy"])) if args.save_path is not None: print("Saving predictions to ", args.save_path) predictions_dict = predict(loader=test_loader, model=model, criterion=criterion, verbose=True) np.savez( args.save_path, predictions=predictions_dict["predictions"], targets=predictions_dict["targets"], )
                          BATCH_SIZE, rootdir=train_dir)

net = get_resnet50(n_class=len(le.classes_))
criterion_train = nn.CrossEntropyLoss()
criterion_val = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.fc.parameters(), lr=0.0001)  # small fixed LR, only the new FC layer is trained

state = {'val_acc': [], 'lives': 4, 'best_val_acc': 0}
if CUDA:
    net.cuda()

for epoch in range(EPOCHS):
    print("Epoch: ", epoch + 1)
    train_acc = train_epoch(net, dog_train, criterion_train, optimizer, CUDA)
    print("Evaluating...")
    val_acc = val_epoch(net, dog_val, criterion_val, CUDA)
    state['val_acc'].append(val_acc)
    if val_acc > state['best_val_acc']:
        state['lives'] = 4
        state['best_val_acc'] = val_acc
    else:
        state['lives'] -= 1
        print("Lives left:", state['lives'])
        if state['lives'] == 2:
            optimizer.param_groups[0]['lr'] /= 2  # halve the learning rate when patience runs low
        if state['lives'] == 0:
            break
swa_res = {'loss': None, 'accuracy': None} utils.save_checkpoint( args.dir, start_epoch, state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict() if args.swa else None, swa_n=swa_n if args.swa else None, optimizer=optimizer.state_dict()) for epoch in range(start_epoch, args.epochs): time_ep = time.time() lr = schedule(epoch) utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(loaders['test'], model, criterion) else: test_res = {'loss': None, 'accuracy': None} if args.swa and (epoch + 1) >= args.swa_start and ( epoch + 1 - args.swa_start) % args.swa_c_epochs == 0: utils.moving_average(swa_model, model, 1.0 / (swa_n + 1)) swa_n += 1 if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: utils.bn_update(loaders['train'], swa_model) swa_res = utils.eval(loaders['test'], swa_model, criterion) else: swa_res = {'loss': None, 'accuracy': None}
def main(device, tr_loader, va_loader, te_loader, modelSelection): """Train CNN and show training plots.""" # Model if modelSelection.lower() == 'res50': model = Res50() elif modelSelection.lower() == 'dense121': model = Dense121() elif modelSelection.lower() == 'dense161': model = Dense161() elif modelSelection.lower() == 'mobv2': model = Mob_v2() elif modelSelection.lower() == 'dense169': model = Dense169() elif modelSelection.lower() == 'mob': model = Net() elif modelSelection.lower() == 'squeeze': model = Squeeze() else: assert False, 'Wrong type of model selection string!' model = model.to(device) # TODO: define loss function, and optimizer learning_rate = utils.config(modelSelection + ".learning_rate") criterion = DepthLoss(0.1).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) number_of_epoches = 10 # # Attempts to restore the latest checkpoint if exists print("Loading unet...") model, start_epoch, stats = utils.restore_checkpoint( model, utils.config(modelSelection + ".checkpoint")) running_va_loss = [] if 'va_loss' not in stats else stats['va_loss'] running_va_acc = [] if 'va_err' not in stats else stats['va_err'] running_tr_loss = [] if 'tr_loss' not in stats else stats['tr_loss'] running_tr_acc = [] if 'tr_err' not in stats else stats['tr_err'] tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) acc, loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(acc) running_va_loss.append(loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) stats = { 'va_err': running_va_acc, 'va_loss': running_va_loss, 'tr_err': running_tr_acc, 'tr_loss': running_tr_loss, } # Loop over the entire dataset multiple times # for epoch in range(start_epoch, config('cnn.num_epochs')): epoch = start_epoch # while curr_patience < patience: while epoch < number_of_epoches: # Train model utils.train_epoch(device, tr_loader, model, criterion, optimizer) # Save checkpoint utils.save_checkpoint(model, epoch + 1, utils.config(modelSelection + ".checkpoint"), stats) # Evaluate model tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) va_acc, va_loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(va_acc) running_va_loss.append(va_loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) epoch += 1 print("Finished Training") utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss, running_va_acc)
def main(): args = parser() torch.backends.cudnn.benchmark = True torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) ###### # prepare model and dataset ###### print("Using model %s" % args.model) model_cfg = getattr(models, args.model) loaders, num_classes, num_data = generate_data(args, model_cfg) train_loader = loaders["train"] test_loader = loaders["test"] print("Preparing model") model = model_cfg.base(*model_cfg.args, num_classes=num_classes, **model_cfg.kwargs) model.cuda() ## please note that this code will only work if the checkpoints are saved as cuda tensors print("Loading Model") checkpoint = torch.load(args.resume) start_epoch = checkpoint["epoch"] model.load_state_dict(checkpoint["net"]) ####### # prepare linearized model by cloning parameters current_pars = finite_ntk.lazy.utils.flatten(model.parameters()) # but dont initialize to zero so we add a little bit of noise eps = 1e-6 pars = torch.clone(current_pars.data) + eps pars = pars.detach_().view(-1, 1) pars.requires_grad = True if args.inference == "vi": sigma_pars = -5.0 * torch.ones_like(pars) pars = [pars, sigma_pars] else: pars = [pars] optimizer = torch.optim.Adam(pars, lr=args.lr_init, amsgrad=True) # set model in eval mode to freeze batch norm and dropout model.eval() loss_args = [ model, num_classes, args.bias, args.wd, current_pars, num_data ] loss_instances = { "map": losses.map_crossentropy, "laplace": losses.laplace_crossentropy, "vi": losses.vi_crossentropy, } try: loss_func = loss_instances[args.inference] criterion = loss_func(*loss_args) eval_criterion = loss_func(*loss_args, eval=True) except: raise ValueError("Inference method not found") if args.epochs == 0: eval_dict = eval(loader=test_loader, model=pars, criterion=criterion, verbose=True) print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"], eval_dict["accuracy"])) for epoch in range(args.epochs): train_epoch( loader=train_loader, model=pars, criterion=criterion, optimizer=optimizer, verbose=True, ) if epoch % args.eval_freq == 0: eval_dict = eval(loader=test_loader, model=pars, criterion=eval_criterion, verbose=True) print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"], eval_dict["accuracy"])) if args.save_path is not None: print("Saving predictions to ", args.save_path) predictions_dict = predict(loader=test_loader, model=pars, criterion=eval_criterion, verbose=True) np.savez( args.save_path, weights=pars[0].detach().cpu().numpy(), predictions=predictions_dict["predictions"], targets=predictions_dict["targets"], )
def main(device, tr_loader, va_loader, te_loader, modelSelection): """Train CNN and show training plots.""" # CLI arguments # parser = arg.ArgumentParser(description='We all know what we are doing. Fighting!') # parser.add_argument("--datasize", "-d", default="small", type=str, # help="data size you want to use, small, medium, total") # Parsing # args = parser.parse_args() # Data loaders # datasize = args.datasize # Model if modelSelection.lower() == 'res50': model = Res50() elif modelSelection.lower() == 'dense121': model = Dense121() elif modelSelection.lower() == 'mobv2': model = Mob_v2() elif modelSelection.lower() == 'dense169': model = Dense169() elif modelSelection.lower() == 'mob': model = Net() elif modelSelection.lower() == 'squeeze': model = Squeeze() else: assert False, 'Wrong type of model selection string!' # Model # model = Net() # model = Squeeze() model = model.to(device) # TODO: define loss function, and optimizer learning_rate = utils.config(modelSelection + ".learning_rate") criterion = DepthLoss(0.1).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) number_of_epoches = 10 # # Attempts to restore the latest checkpoint if exists print("Loading unet...") model, start_epoch, stats = utils.restore_checkpoint( model, utils.config(modelSelection + ".checkpoint")) running_va_loss = [] if 'va_loss' not in stats else stats['va_loss'] running_va_acc = [] if 'va_err' not in stats else stats['va_err'] running_tr_loss = [] if 'tr_loss' not in stats else stats['tr_loss'] running_tr_acc = [] if 'tr_err' not in stats else stats['tr_err'] tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) acc, loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(acc) running_va_loss.append(loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) stats = { 'va_err': running_va_acc, 'va_loss': running_va_loss, 'tr_err': running_tr_acc, 'tr_loss': running_tr_loss, # 'num_of_epoch': 0 } # Loop over the entire dataset multiple times # for epoch in range(start_epoch, config('cnn.num_epochs')): epoch = start_epoch # while curr_patience < patience: while epoch < number_of_epoches: # Train model utils.train_epoch(device, tr_loader, model, criterion, optimizer) # Save checkpoint utils.save_checkpoint(model, epoch + 1, utils.config(modelSelection + ".checkpoint"), stats) # Evaluate model tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) va_acc, va_loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(va_acc) running_va_loss.append(va_loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) epoch += 1 print("Finished Training") utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss, running_va_acc)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn,
                                        optimizer, device, scheduler, len(df_train))
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
def main(): parser = argparse.ArgumentParser( description="Run MorphNet Algorithm on Image Classification Model Zoo." ) num_epochs_default = 1000 num_classes_default = 10 batch_size_default = 1024 base_model_name_default = "ResNet50" learning_rate_default = 0.0001 morphnet_regularizer_algorithm_default = "GroupLasso" morphnet_target_cost_default = "FLOPs" morphnet_hardware_default = "V100" morphnet_regularizer_threshold_default = 1e-2 morphnet_regularization_multiplier_default = 1000.0 log_dir_default = "./morphnet_log" main_train_device_default = "/cpu:0" main_eval_device_default = "/gpu:0" num_cuda_device_default = 4 random_seed_default = 0 base_model_choices = [ "ResNet50", "ResNet101", "ResNet152", "ResNet50V2", "ResNet101V2", "ResNet101V2", "ResNet152V2", "VGG16", "VGG19", "Xception", "InceptionV3", "InceptionResNetV2", "MobileNet", "MobileNetV2", "DenseNet121", "DenseNet169", "DenseNet201", "NASNetLarge", "NASNetMobile" ] morphnet_regularizer_algorithm_choices = ["GroupLasso", "Gamma"] morphnet_target_cost_choices = ["FLOPs", "Latency", "ModelSize"] morphnet_hardware_choices = ["V100", "P100", "Others"] parser.add_argument("--num-epochs", type=int, help="The number of epochs for training.", default=num_epochs_default) parser.add_argument("--num-classes", type=int, help="The number of classes for image classification.", default=num_classes_default) parser.add_argument("--batch-size", type=int, help="Batch size.", default=batch_size_default) parser.add_argument("--learning-rate", type=float, help="Learning rate.", default=learning_rate_default) parser.add_argument("--base-model-name", type=str, choices=base_model_choices, help="Select base model for image classification.", default=base_model_name_default) parser.add_argument("--morphnet-regularizer-algorithm", type=str, choices=morphnet_regularizer_algorithm_choices, help="Select MorphNet regularization algorithm.", default=morphnet_regularizer_algorithm_default) parser.add_argument("--morphnet-target-cost", type=str, choices=morphnet_target_cost_choices, help="Select MorphNet target cost.", default=morphnet_target_cost_default) parser.add_argument("--morphnet-hardware", type=str, choices=morphnet_hardware_choices, help="Select MorphNet hardware.", default=morphnet_hardware_default) parser.add_argument( "--morphnet-regularizer-threshold", type=float, help="Set the threshold [0, 1] for killing neuron layers.", default=morphnet_regularizer_threshold_default) parser.add_argument( "--morphnet-regularization-multiplier", type=float, help= "Set MorphNet regularization multiplier for regularization strength. The regularization strength for training equals the regularization multiplier divided by the initial cost of the model. 
Set this value to zero turns of MorphNet regularization.", default=morphnet_regularization_multiplier_default) parser.add_argument( "--log-dir", type=str, help="Log directory for TensorBoard and optimized model architectures.", default=log_dir_default) parser.add_argument("--num-cuda-device", type=int, help="Number of CUDA device to use.", default=num_cuda_device_default) parser.add_argument("--random-seed", type=int, help="Random seed.", default=random_seed_default) parser.add_argument( "--main-train-device", type=str, help="The device where the model parameters were located.", default=main_train_device_default) parser.add_argument("--main-eval-device", type=str, help="The device used for model evaluation", default=main_eval_device_default) argv = parser.parse_args() num_epochs = argv.num_epochs num_classes = argv.num_classes batch_size = argv.batch_size base_model_name = argv.base_model_name learning_rate = argv.learning_rate morphnet_regularizer_algorithm = argv.morphnet_regularizer_algorithm morphnet_target_cost = argv.morphnet_target_cost morphnet_hardware = argv.morphnet_hardware morphnet_regularizer_threshold = argv.morphnet_regularizer_threshold morphnet_regularization_multiplier = argv.morphnet_regularization_multiplier log_dir = argv.log_dir num_cuda_device = argv.num_cuda_device random_seed = argv.random_seed main_train_device = argv.main_train_device main_eval_device = argv.main_eval_device set_reproducible_environment(random_seed=random_seed) (x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data() # Convert class vectors to binary class matrices. y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes) y_valid_onehot = tf.keras.utils.to_categorical(y_valid, num_classes) image_shape = x_train[1:] # Normalize image inputs x_train = x_train.astype("float32") / 255.0 x_valid = x_valid.astype("float32") / 255.0 base_model = select_keras_base_model(base_model_name=base_model_name) morphnet_regularization_strength_dummy = 1e-9 model = MorphNetModel( base_model=base_model, num_classes=num_classes, learning_rate=learning_rate, batch_size=batch_size, num_gpus=num_cuda_device, main_train_device=main_train_device, main_eval_device=main_eval_device, morphnet_regularizer_algorithm=morphnet_regularizer_algorithm, morphnet_target_cost=morphnet_target_cost, morphnet_hardware=morphnet_hardware, morphnet_regularizer_threshold=morphnet_regularizer_threshold, morphnet_regularization_strength=morphnet_regularization_strength_dummy, log_dir=log_dir) # Export the unmodified model configures. initial_cost = model.get_model_cost(inputs=x_train[:batch_size]) print("*" * 100) print("Initial Model Cost: {:.1f}".format(initial_cost)) morphnet_regularization_strength = 1.0 / initial_cost * morphnet_regularization_multiplier print("Use Regularization Strength: {}".format( morphnet_regularization_strength)) model.set_morphnet_regularization_strength( morphnet_regularization_strength=morphnet_regularization_strength) print("*" * 100) # Export the unmodified model configures. model.export_model_config_with_inputs(inputs=x_train[:batch_size]) for epoch in range(num_epochs): validate_epoch(epoch=epoch, model=model, x_valid=x_valid, y_valid_onehot=y_valid_onehot, batch_size=batch_size) train_epoch(epoch=epoch, model=model, x_train=x_train, y_train_onehot=y_train_onehot, batch_size=batch_size, shuffle=True, print_batch_info=False) # Export the model configure routinely. 
        model.export_model_config_with_inputs(inputs=x_train[:batch_size])

    validate_epoch(epoch=num_epochs,
                   model=model,
                   x_valid=x_valid,
                   y_valid_onehot=y_valid_onehot,
                   batch_size=batch_size)

    model.close()

    return 0
def main(args): savedir = "./saved-outputs/" ## randomly initialize simplexes to determine regularization parameters ## reg_pars = [] for ii in range(args.n_verts+1): fix_pts = [True]*(ii + 1) start_vert = len(fix_pts) out_dim = 100 simplex_model = SimplexNet(out_dim, VGG16Simplex, n_vert=start_vert, fix_points=fix_pts) simplex_model = simplex_model.cuda() log_vol = (simplex_model.total_volume() + 1e-4).log() reg_pars.append(max(float(args.LMBD)/log_vol, 1e-8)) ## import training and testing data ## transform_train = transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) dataset = torchvision.datasets.CIFAR100(args.data_path, train=True, download=False, transform=transform_train) trainloader = DataLoader(dataset, shuffle=True, batch_size=args.batch_size, num_workers=4, pin_memory=True) testset = torchvision.datasets.CIFAR100(args.data_path, train=False, download=False, transform=transform_test) testloader = DataLoader(testset, shuffle=True, batch_size=args.batch_size, num_workers=4, pin_memory=True) columns = ['component', 'vert', 'ep', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time', "vol"] for component in range(args.n_component): ## load in pre-trained model ## fix_pts = [False] simplex_model = SimplexNet(100, VGG16Simplex, n_vert=1, fix_points=fix_pts).cuda() ## add a new points and train ## for vv in range(args.n_verts): if vv == 0: optimizer = torch.optim.SGD( simplex_model.parameters(), lr=args.base_lr, momentum=0.9, weight_decay=args.wd ) else: optimizer = torch.optim.SGD( simplex_model.parameters(), lr=args.simplex_lr, momentum=0.9, weight_decay=args.wd ) criterion = torch.nn.CrossEntropyLoss() n_epoch = args.base_epochs if vv==0 else args.simplex_epochs scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epoch) for epoch in range(n_epoch): time_ep = time.time() if vv == 0: train_res = utils.train_epoch(trainloader, simplex_model, criterion, optimizer) else: train_res = utils.train_epoch_volume(trainloader, simplex_model, criterion, optimizer, reg_pars[vv], args.n_sample) start_ep = (epoch == 0) eval_ep = epoch % args.eval_freq == args.eval_freq - 1 end_ep = epoch == n_epoch - 1 if start_ep or eval_ep or end_ep: test_res = utils.eval(testloader, simplex_model, criterion) else: test_res = {'loss': None, 'accuracy': None} time_ep = time.time() - time_ep lr = optimizer.param_groups[0]['lr'] scheduler.step() values = [component, vv, epoch + 1, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy'], time_ep, simplex_model.total_volume().item()] table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if epoch % 40 == 0: table = table.split('\n') table = '\n'.join([table[1]] + table) else: table = table.split('\n')[2] print(table, flush=True) checkpoint = simplex_model.state_dict() fname = "base_" + str(component) + "_simplex_" + str(vv) + ".pt" torch.save(checkpoint, savedir + fname) simplex_model.add_vert() simplex_model = simplex_model.cuda()
def run_evaluation(model, ensemble_model, data_loaders, args, save_model='', load_model=''): all_values = {} device = 'cuda' utils.setup_torch(args['seed']) inputs = torch.randn( (1, args['input_channels'], args['img_size'], args['img_size'])) total_ops, total_params = profile(model, (inputs, ), verbose=True) all_values['MMACs'] = np.round(total_ops / (1000.0**2), 2) all_values['Params'] = int(total_params) print(all_values) start = time.time() model = model.to(device) ensemble_model = ensemble_model.to(device) print('models to device', time.time() - start) if len(load_model) > 0: model.load_state_dict(torch.load(os.path.join(args['dir'], load_model))) criterion = torch.nn.CrossEntropyLoss() ################################################ summary(model, (3, 32, 32), batch_size=args['batch_size'], device='cuda') criterion = torch.nn.CrossEntropyLoss().to(device) optimizer = torch.optim.SGD(model.parameters(), lr=args['lr_init'], momentum=0.9, weight_decay=1e-4) lrs = [] n_models = 0 all_values['epoch'] = [] all_values['overall_time'] = [] all_values['lr'] = [] all_values['tr_loss'] = [] all_values['tr_acc'] = [] all_values['val_loss_single'] = [] all_values['val_acc_single'] = [] all_values['val_loss_ensemble'] = [] all_values['val_acc_ensemble'] = [] all_values['test_loss_single'] = [] all_values['test_acc_single'] = [] all_values['test_loss_ensemble'] = [] all_values['test_acc_ensemble'] = [] n_models = 0 time_start = time.time() for epoch in range(args['epochs']): time_ep = time.time() lr = utils.get_cyclic_lr(epoch, lrs, args['lr_init'], args['lr_start_cycle'], args['cycle_period']) #print ('lr=%.3f' % lr) utils.set_learning_rate(optimizer, lr) lrs.append(lr) train_res = utils.train_epoch(device, data_loaders['train'], model, criterion, optimizer, args['num_samples_train']) values = [epoch + 1, lr, train_res['loss'], train_res['accuracy']] if (epoch + 1) >= args['lr_start_cycle'] and ( epoch + 1) % args['cycle_period'] == 0: all_values['epoch'].append(epoch + 1) all_values['lr'].append(lr) all_values['tr_loss'].append(train_res['loss']) all_values['tr_acc'].append(train_res['accuracy']) val_res = utils.evaluate(device, data_loaders['val'], model, criterion, args['num_samples_val']) test_res = utils.evaluate(device, data_loaders['test'], model, criterion, args['num_samples_test']) all_values['val_loss_single'].append(val_res['loss']) all_values['val_acc_single'].append(val_res['accuracy']) all_values['test_loss_single'].append(test_res['loss']) all_values['test_acc_single'].append(test_res['accuracy']) utils.moving_average_ensemble(ensemble_model, model, 1.0 / (n_models + 1)) utils.bn_update(device, data_loaders['train_for_bn_recalc'], ensemble_model) n_models += 1 val_res = utils.evaluate(device, data_loaders['val'], ensemble_model, criterion, args['num_samples_val']) test_res = utils.evaluate(device, data_loaders['test'], ensemble_model, criterion, args['num_samples_test']) all_values['val_loss_ensemble'].append(val_res['loss']) all_values['val_acc_ensemble'].append(val_res['accuracy']) all_values['test_loss_ensemble'].append(test_res['loss']) all_values['test_acc_ensemble'].append(test_res['accuracy']) overall_training_time = time.time() - time_start all_values['overall_time'].append(overall_training_time) #print (epoch, 'epoch_time', time.time() - time_ep) overall_training_time = time.time() - time_start #print ('overall time', overall_training_time) #print (all_values) if len(save_model) > 0: torch.save(ensemble_model.state_dict(), os.path.join(args['dir'], save_model + 
                               '_ensemble'))
        torch.save(model.state_dict(), os.path.join(args['dir'], save_model))

    return all_values
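# Hedged sketch, not part of the original script: utils.get_cyclic_lr and utils.set_learning_rate
# used above are not shown in this excerpt. One plausible reading of the
# (epoch, lrs, lr_init, lr_start_cycle, cycle_period) signature is a constant warm-up phase
# followed by a sawtooth cycle; the actual helpers may differ, and `lrs` (the LR history) is
# unused in this guess.
def get_cyclic_lr_sketch(epoch, lrs, lr_init, lr_start_cycle, cycle_period):
    if epoch < lr_start_cycle:
        return lr_init
    # position within the current cycle, in [0, 1)
    t = ((epoch - lr_start_cycle) % cycle_period) / cycle_period
    # decay linearly from lr_init towards a small floor inside each cycle
    return lr_init * (1.0 - 0.99 * t)

def set_learning_rate_sketch(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr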
def main(args):
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    train_loader, test_loader = load_dataset(args.label, args.batch_size,
                                             args.half_length, args.nholes)
    if args.label == 10:
        model = ShakeResNet(args.depth, args.w_base, args.label)
    else:
        model = ShakeResNeXt(args.depth, args.w_base, args.cardinary, args.label)
    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True

    if args.optimizer == 'sgd':
        print("using sgd")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        nesterov=args.nesterov)
    elif args.optimizer == 'abd':
        print("using adabound")
        opt = abd.AdaBound(model.parameters(),
                           lr=args.lr,
                           gamma=args.gamma,
                           weight_decay=args.weight_decay,
                           final_lr=args.final_lr)
    elif args.optimizer == 'swa':
        print("using swa")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
        steps_per_epoch = int(len(train_loader.dataset) / args.batch_size)
        opt = swa(opt,
                  swa_start=args.swa_start * steps_per_epoch,
                  swa_freq=steps_per_epoch,
                  swa_lr=args.swa_lr)
    else:
        print("not valid optimizer")
        exit()

    loss_func = nn.CrossEntropyLoss().cuda()

    headers = ["Epoch", "LearningRate", "TrainLoss", "TestLoss", "TrainAcc.", "TestAcc."]
    # if args.optimizer == 'swa':
    #     headers = headers[:-1] + ['swa_te_loss', 'swa_te_acc'] + headers[-1:]
    #     swa_res = {'loss': None, 'accuracy': None}
    logger = utils.Logger(args.checkpoint, headers, mod=args.optimizer)

    for e in range(args.epochs):
        if args.optimizer == 'swa':
            lr = utils.schedule(e, args.optimizer, args.epochs, args.swa_start,
                                args.swa_lr, args.lr)
            utils.adjust_learning_rate(opt, lr)
        elif args.optimizer == 'sgd':
            lr = utils.cosine_lr(opt, args.lr, e, args.epochs)
        else:
            exit()

        # train
        train_loss, train_acc, train_n = utils.train_epoch(train_loader, model, opt)
        # eval
        test_loss, test_acc, test_n = utils.eval_epoch(test_loader, model)
        logger.write(e + 1, lr, train_loss / train_n, test_loss / test_n,
                     train_acc / train_n * 100, test_acc / test_n * 100)

        if args.optimizer == 'swa' and (e + 1) >= args.swa_start and args.eval_freq > 1:
            if e == 0 or e % args.eval_freq == args.eval_freq - 1 or e == args.epochs - 1:
                opt.swap_swa_sgd()
                opt.bn_update(train_loader, model, device='cuda')
                # swa_res = utils.eval_epoch(test_loader, model)
                opt.swap_swa_sgd()
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']

utils.save_checkpoint(args.dir,
                      start_epoch,
                      state_dict=model.state_dict(),
                      optimizer=optimizer.state_dict())

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer,
                                  aug_reg=args.aug_reg)

    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion)
    else:
        test_res = {'loss': None, 'accuracy': None}

    lr = optimizer.param_groups[0]['lr']
    print("Brightness", model.aug[0].lims)
    print("Contrast", model.aug[1].lims)
    scheduler.step()

    if (epoch + 1) % args.save_freq == 0:
        utils.save_checkpoint(args.dir, epoch + 1,
def train_oneshot_model(args, data_loaders, n_cells, n_choices, put_downsampling=[]): num_samples = utils.get_number_of_samples(args.dataset) device = 'cuda' utils.setup_torch(args.seed) print('Initializing model...') #Create a supernet skeleton (include all cell types for each position) propagate_weights = [[1, 1, 1] for i in range(n_cells)] model_class = getattr(models, 'Supernet') #Create the supernet model and its SWA ensemble version model = model_class(num_classes=utils.get_number_of_classes(args.dataset), propagate=propagate_weights, training=True, n_choices=n_choices, put_downsampling=put_downsampling).to(device) ensemble_model = model_class(num_classes=utils.get_number_of_classes( args.dataset), propagate=propagate_weights, training=True, n_choices=n_choices, put_downsampling=put_downsampling).to(device) #These summaries are for verification purposes only #However, removing them will cause inconsistency in results since random generators are used inside them to propagate summary(model, (3, 32, 32), batch_size=args.batch_size, device='cuda') summary(ensemble_model, (3, 32, 32), batch_size=args.batch_size, device='cuda') criterion = torch.nn.CrossEntropyLoss().to(device) optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_init, momentum=0.9, weight_decay=1e-4) start_epoch = 0 columns = [ 'epoch time', 'overall training time', 'epoch', 'lr', 'train_loss', 'train_acc', 'val_loss', 'val_acc', 'test_loss', 'test_acc' ] lrs = [] n_models = 0 all_values = {} all_values['epoch'] = [] all_values['lr'] = [] all_values['tr_loss'] = [] all_values['tr_acc'] = [] all_values['val_loss'] = [] all_values['val_acc'] = [] all_values['test_loss'] = [] all_values['test_acc'] = [] n_models = 0 print('Start training...') time_start = time.time() for epoch in range(start_epoch, args.epochs): time_ep = time.time() #lr = utils.get_cosine_annealing_lr(epoch, args.lr_init, args.epochs) lr = utils.get_cyclic_lr(epoch, lrs, args.lr_init, args.lr_start_cycle, args.cycle_period) utils.set_learning_rate(optimizer, lr) lrs.append(lr) train_res = utils.train_epoch(device, data_loaders['train'], model, criterion, optimizer, num_samples['train']) values = [epoch + 1, lr, train_res['loss'], train_res['accuracy']] if (epoch + 1) >= args.lr_start_cycle and (epoch + 1) % args.cycle_period == 0: all_values['epoch'].append(epoch + 1) all_values['lr'].append(lr) all_values['tr_loss'].append(train_res['loss']) all_values['tr_acc'].append(train_res['accuracy']) val_res = utils.evaluate(device, data_loaders['val'], model, criterion, num_samples['val']) test_res = utils.evaluate(device, data_loaders['test'], model, criterion, num_samples['test']) all_values['val_loss'].append(val_res['loss']) all_values['val_acc'].append(val_res['accuracy']) all_values['test_loss'].append(test_res['loss']) all_values['test_acc'].append(test_res['accuracy']) values += [ val_res['loss'], val_res['accuracy'], test_res['loss'], test_res['accuracy'] ] utils.moving_average_ensemble(ensemble_model, model, 1.0 / (n_models + 1)) utils.bn_update(device, data_loaders['train'], ensemble_model) n_models += 1 print(all_values) overall_training_time = time.time() - time_start values = [time.time() - time_ep, overall_training_time] + values table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') print(table) print('Training finished. 
Saving final nets...')
    utils.save_result(all_values, args.dir, 'model_supernet')
    torch.save(model.state_dict(), args.dir + '/supernet.pth')
    torch.save(ensemble_model.state_dict(), args.dir + '/supernet_swa.pth')
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
loss = WeightAdjustingLoss().to(device)

for epoch in range(EPOCHS):
    start_time = time.time()

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss,
        optimizer,
        device,
        scheduler,
        6217  # hard-coded number of training examples
    )
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss,
        device,
        777  # hard-coded number of validation examples
    )

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch::{epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'Train Loss {train_loss} accuracy {train_acc}')
                      momentum=0.9, nesterov=True)

best_acc = 0.0
best_epoch = None
fpr = []
tpr = []

for epoch in range(MAX_EPOCH):
    optimizer, lr = utils.adjust_learning_rate(optimizer, epoch, 0.00001, 0.01, 8)

    # Training
    train_loss = utils.train_epoch(epoch, net, optimizer, train_dataloader, criterion)

    # Validating
    val_loss, preds, truth = utils.validate_epoch(net, test_dataloader, criterion)
    auc_val = roc_auc_score(truth, preds)
    print(epoch + 1, train_loss, val_loss, auc_val)

    if auc_val > best_acc:
        best_acc = auc_val
        best_epoch = epoch
        fpr, tpr, thresholds = roc_curve(truth, preds)

plt.plot(fpr, tpr, lw=2, alpha=0.8, label='ROC (AUC = %0.2f)' % (best_acc))
plt.show()
if swa_n_ckpt is not None: swa_n = swa_n_ckpt criterion = F.cross_entropy columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time'] if args.swa: columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:] swa_res = {'loss': None, 'accuracy': None} for epoch in range(start_epoch, args.ine_start): time_ep = time.time() lr = schedule_swa(epoch) utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer, 'swa', args) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(loaders['test'], model, criterion, 'swa') else: test_res = {'loss': None, 'accuracy': None} if args.swa and (epoch + 1) >= args.swa_start and ( epoch + 1 - args.swa_start) % args.swa_c_epochs == 0: utils.moving_average(swa_model, model, 1.0 / (swa_n + 1)) swa_n += 1 if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: utils.bn_update(loaders['train'], swa_model) swa_res = utils.eval(loaders['test'], swa_model, criterion, 'swa') else: swa_res = {'loss': None, 'accuracy': None}
state_dict=model.state_dict(), optimizer=optimizer.state_dict() ) for epoch in range(start_epoch, args.epochs): time_ep = time.time() #memory_prior = torch.cuda.memory_allocated() if not args.no_schedule: lr = schedule(epoch) utils.adjust_learning_rate(optimizer, lr) else: lr = args.lr_init if (args.swa and (epoch + 1) > args.swa_start) and args.cov_mat: model_batch_means, train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer, batch_means=True) else: model_batch_means = None train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(loaders['test'], model, criterion) else: test_res = {'loss': None, 'accuracy': None} if args.swa and (epoch + 1) > args.swa_start and (epoch + 1 - args.swa_start) % args.swa_c_epochs == 0: swag_model.collect_model(model, bm=model_batch_means) del model_batch_means if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: swag_model.sample(0.0) utils.bn_update(loaders['train'], swag_model)
def main(args): os.makedirs("./saved-outputs/", exist_ok=True) transform_train = transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, padding=4), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) dataset = torchvision.datasets.CIFAR100(args.data_path, train=True, download=False, transform=transform_train) trainloader = DataLoader(dataset, shuffle=True, batch_size=args.batch_size) testset = torchvision.datasets.CIFAR100(args.data_path, train=False, download=False, transform=transform_test) testloader = DataLoader(testset, shuffle=True, batch_size=args.batch_size) model = VGG16(100) model = model.cuda() optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_init, momentum=0.9, weight_decay=args.wd) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs) criterion = torch.nn.CrossEntropyLoss() columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time'] for epoch in range(args.epochs): time_ep = time.time() train_res = utils.train_epoch(trainloader, model, criterion, optimizer) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(testloader, model, criterion) else: test_res = {'loss': None, 'accuracy': None} time_ep = time.time() - time_ep lr = optimizer.param_groups[0]['lr'] scheduler.step() values = [ epoch + 1, lr, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy'], time_ep ] table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if epoch % 40 == 0: table = table.split('\n') table = '\n'.join([table[1]] + table) else: table = table.split('\n')[2] print(table, flush=True) checkpoint = model.state_dict() trial_num = len(glob.glob("./saved-outputs/model_*")) savedir = "./saved-outputs/model_" +\ str(trial_num) + "/" os.makedirs(savedir, exist_ok=True) torch.save(checkpoint, savedir + "base_model.pt")
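# Hedged sketch, not part of the original scripts: the utils.train_epoch / utils.eval helpers
# used throughout these snippets are not shown in this excerpt. A minimal pair consistent with
# the returned {'loss', 'accuracy'} dictionaries could look like this; the real helpers may differ.
import torch

def train_epoch_sketch(loader, model, criterion, optimizer, device='cuda'):
    model.train()
    loss_sum, correct, n = 0.0, 0, 0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        loss_sum += loss.item() * inputs.size(0)
        correct += (outputs.argmax(dim=1) == targets).sum().item()
        n += inputs.size(0)
    return {'loss': loss_sum / n, 'accuracy': 100.0 * correct / n}

def eval_sketch(loader, model, criterion, device='cuda'):
    model.eval()
    loss_sum, correct, n = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, targets).item() * inputs.size(0)
            correct += (outputs.argmax(dim=1) == targets).sum().item()
            n += inputs.size(0)
    return {'loss': loss_sum / n, 'accuracy': 100.0 * correct / n}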
                      step_size=1, gamma=lr_decay)
loss_fun = torch.nn.MSELoss()

min_mse = 100
train_mse = []
valid_mse = []
test_mse = []

for i in range(n_epochs):
    start = time.time()
    scheduler.step()
    model.train()

    # use train_epoch_scale/eval_epoch_scale for training scale-equivariant models
    train_mse.append(train_epoch(train_loader, model, optimizer, loss_fun))

    model.eval()
    mse, _, _ = eval_epoch(valid_loader, model, loss_fun)
    valid_mse.append(mse)

    if valid_mse[-1] < min_mse:
        min_mse = valid_mse[-1]
        best_model = model
        torch.save(best_model, save_name + ".pth")

    end = time.time()

    # Early stopping, but train for at least 50 epochs
    if (len(train_mse) > 50 and
            np.mean(valid_mse[-5:]) >= np.mean(valid_mse[-10:-5])):
        break

    print(i + 1, train_mse[-1], valid_mse[-1], round((end - start) / 60, 5),
cost = stat['embedding'] + 0. #stat['cp'][0] = ot.emd(ps, pt, stat['embedding'] ) + 0. stat['cp'][0] = ot.optim.cg(ps, pt, cost, reg, f, df, verbose=True) ###display initial guess couplings ot.plot.plot1D_mat(ps, pt, stat['cp'][0], 'OT matrix Entrop. reg') # In[9]: # pre train model on source task and save the model network = CNN().to(stat['dev']) optimizer = optim.SGD(network.parameters(), lr=1e-3, momentum=0.9, weight_decay=stat['weight_decay']) for epoch in range(30): train_epoch(network, stat, optimizer) if (epoch + 1) % 5 == 0: test(stat, network) ####saving pretrained model torch.save(network.state_dict(), os.path.join(MNIST_tran_ini, 'CNN={}.pth'.format('animal'))) # In[10]: ####loading pre-trained model network = CNN().to(stat['dev']) network.load_state_dict( torch.load(os.path.join(MNIST_tran_ini, 'CNN={}.pth'.format('animal')))) optimizer = optim.SGD(network.parameters(), lr=1e-3, momentum=0.9,
start_epoch = 0

# Prepare logging
columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    lr = schedule(epoch)
    grad_quantizer = lambda x: wage_qtorch.QG(
        x, args.wl_grad, args.wl_rand, lr, mode=args.grad_rounding)

    train_res = utils.train_epoch(loaders['train'], model, criterion,
                                  weight_quantizer, grad_quantizer, epoch,
                                  wage_quantize=True, wage_grad_clip=grad_clip)

    # Validation
    test_res = utils.eval(loaders['test'], model, criterion, weight_quantizer)

    time_ep = time.time() - time_ep

    values = [
        epoch + 1, lr, train_res['loss'], train_res['accuracy'],
        test_res['loss'], test_res['accuracy'], time_ep
    ]
    table = tabulate.tabulate([values], columns,
def main(device=torch.device('cuda:0')): # CLI arguments parser = arg.ArgumentParser( description='We all know what we are doing. Fighting!') parser.add_argument("--datasize", "-d", default="small", type=str, help="data size you want to use, small, medium, total") # Parsing args = parser.parse_args() # Data loaders datasize = args.datasize pathname = "data/nyu.zip" tr_loader, va_loader, te_loader = getTrainingValidationTestingData( datasize, pathname, batch_size=config("unet.batch_size")) # Model model = Net() # TODO: define loss function, and optimizer learning_rate = utils.config("unet.learning_rate") criterion = DepthLoss(0.1) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) number_of_epoches = 10 # # print("Number of float-valued parameters:", util.count_parameters(model)) # Attempts to restore the latest checkpoint if exists print("Loading unet...") model, start_epoch, stats = utils.restore_checkpoint( model, utils.config("unet.checkpoint")) # axes = utils.make_training_plot() # Evaluate the randomly initialized model # evaluate_epoch( # axes, tr_loader, va_loader, te_loader, model, criterion, start_epoch, stats # ) # loss = criterion() # initial val loss for early stopping # prev_val_loss = stats[0][1] running_va_loss = [] running_va_acc = [] running_tr_loss = [] running_tr_acc = [] # TODO: define patience for early stopping # patience = 1 # curr_patience = 0 # tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) acc, loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(acc) running_va_loss.append(loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) # Loop over the entire dataset multiple times # for epoch in range(start_epoch, config('cnn.num_epochs')): epoch = start_epoch # while curr_patience < patience: while epoch < number_of_epoches: # Train model utils.train_epoch(tr_loader, model, criterion, optimizer) tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device) va_acc, va_loss = utils.evaluate_model(model, va_loader, device) running_va_acc.append(va_acc) running_va_loss.append(va_loss) running_tr_acc.append(tr_acc) running_tr_loss.append(tr_loss) # Evaluate model # evaluate_epoch( # axes, tr_loader, va_loader, te_loader, model, criterion, epoch + 1, stats # ) # Save model parameters utils.save_checkpoint(model, epoch + 1, utils.config("unet.checkpoint"), stats) # update early stopping parameters """ curr_patience, prev_val_loss = early_stopping( stats, curr_patience, prev_val_loss ) """ epoch += 1 print("Finished Training") # Save figure and keep plot open # utils.save_training_plot() # utils.hold_training_plot() utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss, running_va_acc)
def main(): ds = getattr(torchvision.datasets, args.dataset) path = os.path.join(args.data_path, args.dataset.lower()) train_set = ds(path, train=True, download=True, transform=model_cfg.transform_train) test_set = ds(path, train=False, download=True, transform=model_cfg.transform_test) loaders = { 'train': torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True), 'test': torch.utils.data.DataLoader(test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True) } num_classes = len(train_set.classes) # max(train_set.train_labels) + 1 print(num_classes) print('Preparing model') model = model_cfg.base(*model_cfg.args, num_classes=num_classes, **model_cfg.kwargs) model.cuda() if args.swa: print('SWA training') swa_model = model_cfg.base(*model_cfg.args, num_classes=num_classes, **model_cfg.kwargs) swa_model.cuda() swa_n = 0 else: print('SGD training') def schedule(epoch): t = (epoch) / (args.swa_start if args.swa else args.epochs) lr_ratio = args.swa_lr / args.lr_init if args.swa else 0.01 if t <= 0.5: factor = 1.0 elif t <= 0.9: factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4 else: factor = lr_ratio return args.lr_init * factor criterion = F.cross_entropy optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_init, momentum=args.momentum, weight_decay=args.wd) start_epoch = 0 if args.resume is not None: print('Resume training from %s' % args.resume) checkpoint = torch.load(args.resume) start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) if args.swa: swa_state_dict = checkpoint['swa_state_dict'] if swa_state_dict is not None: swa_model.load_state_dict(swa_state_dict) swa_n_ckpt = checkpoint['swa_n'] if swa_n_ckpt is not None: swa_n = swa_n_ckpt columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time'] if args.swa: columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:] swa_res = {'loss': None, 'accuracy': None} utils.save_checkpoint( args.dir, start_epoch, state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict() if args.swa else None, swa_n=swa_n if args.swa else None, optimizer=optimizer.state_dict()) for epoch in range(start_epoch, args.epochs): time_ep = time.time() lr = schedule(epoch) utils.adjust_learning_rate(optimizer, lr) train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer) if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: test_res = utils.eval(loaders['test'], model, criterion) else: test_res = {'loss': None, 'accuracy': None} if args.swa and (epoch + 1) >= args.swa_start and ( epoch + 1 - args.swa_start) % args.swa_c_epochs == 0: utils.moving_average(swa_model, model, 1.0 / (swa_n + 1)) swa_n += 1 if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1: utils.bn_update(loaders['train'], swa_model) swa_res = utils.eval(loaders['test'], swa_model, criterion) else: swa_res = {'loss': None, 'accuracy': None} if (epoch + 1) % args.save_freq == 0: utils.save_checkpoint( args.dir, epoch + 1, state_dict=model.state_dict(), swa_state_dict=swa_model.state_dict() if args.swa else None, swa_n=swa_n if args.swa else None, optimizer=optimizer.state_dict()) time_ep = time.time() - time_ep values = [ epoch + 1, lr, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy'], time_ep ] if args.swa: values = values[:-1] + [swa_res['loss'], 
                              swa_res['accuracy']] + values[-1:]

        table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f')
        if epoch % 40 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

    if args.epochs % args.save_freq != 0:
        utils.save_checkpoint(
            args.dir,
            args.epochs,
            state_dict=model.state_dict(),
            swa_state_dict=swa_model.state_dict() if args.swa else None,
            swa_n=swa_n if args.swa else None,
            optimizer=optimizer.state_dict())
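# Hedged sketch, not part of the original script: utils.bn_update(loaders['train'], swa_model)
# above is assumed to recompute BatchNorm running statistics for the averaged weights with one
# pass over the training data. The official SWA helper additionally adjusts the BN momentum per
# batch, which this simplified guess omits; the repository's own utils may differ.
import torch

def bn_update_sketch(loader, model, device='cuda'):
    # reset running stats, then refresh them with a single forward pass in train mode
    for module in model.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.reset_running_stats()
    model.train()
    with torch.no_grad():
        for inputs, _ in loader:
            model(inputs.to(device))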