Example #1
def train(name, model, training_data, validation_data, crit, optimizer, scheduler, opt):

    valid_aucs = [0.]
    for epoch_i in range(opt.epoch):
        print('[ Epoch', epoch_i, ']')

        start = time.time()
        train_loss, train_auc = train_epoch(model, training_data, crit, optimizer)
        print('  - (Training)   loss: {loss: 8.5f}, auc: {auc:3.3f} %, '\
              'elapse: {elapse:3.3f} min'.format(
                  loss=train_loss, auc=100*train_auc,
                  elapse=(time.time()-start)/60))
        
        
        start = time.time()
        valid_loss, valid_auc, valid_proba = eval_epoch(model, validation_data, crit)

        print('  - (Validation) loss: {loss: 8.5f}, auc: {auc:3.3f} %, '\
                'elapse: {elapse:3.3f} min'.format(
                    loss=valid_loss, auc=100*valid_auc,
                    elapse=(time.time()-start)/60))
        
        best_auc = max(valid_aucs)
        valid_aucs += [valid_auc]
        scheduler.step(valid_loss)

        model_state_dict = model.state_dict()
        checkpoint = {
            'model': model_state_dict,
            'settings': opt,
            'epoch': epoch_i,
            'auc': valid_auc}

        model_name = name + '.chkpt'
        if valid_auc >= best_auc:
            print('new best auc:', valid_auc)
            best_proba = valid_proba
            best_model = model
            if opt.save_model:
                torch.save(checkpoint, 'models/'+model_name)
                print('    - [Info] The checkpoint file has been updated.')

        if opt.log:
            directory = 'predictions/' + opt.name
            os.makedirs(directory, exist_ok=True)  # make sure the log directory exists
            log_train_file = directory + '/train.log'
            log_valid_file = directory + '/valid.log'

            with open(log_train_file, 'a') as log_tf, open(log_valid_file, 'a') as log_vf:
                log_tf.write('{fold},{epoch},{loss: 8.5f},{auc:3.3f}\n'.format(
                    fold=name, epoch=epoch_i, loss=train_loss, auc=100*train_auc))
                log_vf.write('{fold},{epoch},{loss: 8.5f},{auc:3.3f}\n'.format(
                    fold=name, epoch=epoch_i, loss=valid_loss, auc=100*valid_auc))

    return best_model, best_proba

def swa_train(model, swa_model, train_iter, valid_iter, optimizer, criterion, pretrain_epochs, swa_epochs, swa_lr, cycle_length, device, writer, cpt_filename):
    swa_n = 1

    swa_model.load_state_dict(copy.deepcopy(model.state_dict()))

    utils.save_checkpoint(
        cpt_directory,
        1,
        '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length, cpt_filename),
        state_dict=model.state_dict(),
        swa_state_dict=swa_model.state_dict(),
        swa_n=swa_n,
        optimizer=optimizer.state_dict()
    )

    for e in range(swa_epochs):
        epoch = e + pretrain_epochs
        time_ep = time.time()
        lr = utils.schedule(epoch, cycle_length, lr_init, swa_lr)
        utils.adjust_learning_rate(optimizer, lr)

        train_res = utils.train_epoch(model, train_iter, optimizer, criterion, device)
        valid_res = utils.evaluate(model, valid_iter, criterion, device)

        utils.moving_average(swa_model, model, swa_n)
        swa_n += 1
        utils.bn_update(train_iter, swa_model)
        swa_res = utils.evaluate(swa_model, valid_iter, criterion, device)

        time_ep = time.time() - time_ep
        values = [epoch + 1, lr, swa_lr, cycle_length, train_res['loss'], valid_res['loss'], swa_res['loss'], None, None, time_ep]
        writer.writerow(values)

        table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f')
        if epoch % 20 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

        utils.save_checkpoint(
            cpt_directory,
            epoch + 1,
            '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length, cpt_filename),
            state_dict=model.state_dict(),
            swa_state_dict=swa_model.state_dict(),
            swa_n=swa_n,
            optimizer=optimizer.state_dict()
        )

def train(model, train_iter, valid_iter, optimizer, criterion, pretrain_epochs, device, writer, cpt_filename):
    print(f'The model has {utils.count_parameters(model):,} trainable parameters')

    lr = lr_init

    utils.save_checkpoint(
        cpt_directory,
        1,
        date + "-" + cpt_filename,
        state_dict=model.state_dict(),
        optimizer=optimizer.state_dict()
    )

    for epoch in range(pretrain_epochs):
        time_ep = time.time()
        swa = epoch > pretrain_epochs  # note: always False inside this loop, since epoch < pretrain_epochs

        train_res = utils.train_epoch(model, train_iter, optimizer, criterion, device)
        valid_res = utils.evaluate(model, valid_iter, criterion, device)

        time_ep = time.time() - time_ep
        values = [epoch + 1, lr, None, None, train_res['loss'], valid_res['loss'], None, None, None, time_ep]
        writer.writerow(values)

        table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f')
        if epoch % 20 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

        if (epoch + 1) % save_freq == 0 or swa:
            utils.save_checkpoint(
                cpt_directory,
                epoch + 1,
                date + "final-" + cpt_filename,
                state_dict=model.state_dict(),
                optimizer=optimizer.state_dict()
            )
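
The swa_train loop above leans on utils.moving_average to fold the current weights into the running SWA average. That helper is not shown in the snippet; a minimal sketch of the usual equal-weight update, assuming the call signature moving_average(swa_model, model, swa_n) used above:

def moving_average(swa_model, model, n):
    # After n models have been averaged, the (n+1)-th model gets weight
    # 1/(n+1), so the SWA weights stay the mean of all collected weights.
    alpha = 1.0 / (n + 1)
    for swa_p, p in zip(swa_model.parameters(), model.parameters()):
        swa_p.data.mul_(1.0 - alpha).add_(p.data, alpha=alpha)
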
Example #4
train_ds = PAN2020(train_dir, tokenizer, seq_len, 3)
test_ds  = PAN2020(test_dir, tokenizer, seq_len, 3)

train_dataloader = DataLoader(
    train_ds,
    sampler = RandomSampler(train_ds),
    batch_size = batch_size
)

# train_dataloader = DataLoader(
#     test_ds,
#     sampler = RandomSampler(test_ds),
#     batch_size = batch_size
# )

test_dataloader = DataLoader(
    test_ds,
    sampler = RandomSampler(test_ds),
    batch_size = batch_size
)

optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

model.train()

epochs_it = trange(epochs, desc='Epoch', mininterval=0)
global_step = 0

for e in epochs_it:
    global_step = train_epoch(model, optimizer, train_dataloader, test_dataloader, device, tb_writer, global_step=global_step)
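
optimizer_grouped_parameters is used above but never defined in the snippet. A common construction, assumed here from the usual transformer fine-tuning recipe, excludes biases and LayerNorm weights from weight decay:

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
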
Example #5
def main():
    args = parser()

    torch.backends.cudnn.benchmark = True
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    ######
    # prepare model and dataset
    ######

    print("Using model %s" % args.model)
    model_cfg = getattr(models, args.model)

    loaders, num_classes, num_data = generate_data(args, model_cfg)
    train_loader = loaders["train"]
    test_loader = loaders["test"]

    print("Preparing model")
    model = model_cfg.base(*model_cfg.args,
                           num_classes=num_classes,
                           **model_cfg.kwargs)
    model.cuda()

    ## please note that this code will only work if the checkpoints are saved as cuda tensors
    if args.resume is not None:
        print("Loading Model")
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint["epoch"]
        model.load_state_dict(checkpoint["state_dict"])

    model.eval()
    pars = []
    param_name_list = list(model.named_parameters())
    for i, (n, p) in enumerate(param_name_list):
        if i < len(param_name_list) - 2:
            p.requires_grad = False
        else:
            pars.append(p)

    optimizer = torch.optim.Adam(pars, lr=args.lr_init, amsgrad=True)

    criterion = cross_entropy

    if args.epochs == 0:
        eval_dict = eval(loader=test_loader,
                         model=model,
                         criterion=criterion,
                         verbose=True)
        print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"],
                                                  eval_dict["accuracy"]))

    for epoch in range(args.epochs):
        train_epoch(
            loader=train_loader,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            verbose=True,
        )
        eval_dict = eval(loader=test_loader,
                         model=model,
                         criterion=criterion,
                         verbose=True)
        print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"],
                                                  eval_dict["accuracy"]))

    if args.save_path is not None:
        print("Saving predictions to ", args.save_path)
        predictions_dict = predict(loader=test_loader,
                                   model=model,
                                   criterion=criterion,
                                   verbose=True)
        np.savez(
            args.save_path,
            predictions=predictions_dict["predictions"],
            targets=predictions_dict["targets"],
        )
Example #6
                            BATCH_SIZE,
                            rootdir=train_dir)

net = get_resnet50(n_class=len(le.classes_))
criterion_train = nn.CrossEntropyLoss()
criterion_val = nn.CrossEntropyLoss()

optimizer = optim.Adam(net.fc.parameters(),
                       lr=0.0001)  # initial learning rate
state = {'val_acc': [], 'lives': 4, 'best_val_acc': 0}

if CUDA:
    net.cuda()
for epoch in range(EPOCHS):
    print("Epoch: ", epoch + 1)
    train_acc = train_epoch(net, dog_train, criterion_train, optimizer, CUDA)
    print("Evaluating...")
    val_acc = val_epoch(net, dog_val, criterion_val, CUDA)

    state['val_acc'].append(val_acc)
    if val_acc > state['best_val_acc']:
        state['lives'] = 4
        state['best_val_acc'] = val_acc
    else:
        state['lives'] -= 1
        print("Trial left :", state['lives'])
        if state['lives'] == 2:
            optimizer.param_groups[0]['lr'] /= 2
        if state['lives'] == 0:
            break
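
The 'lives' counter above hand-rolls patience logic: reset on improvement, halve the learning rate at 2 lives, stop at 0. The LR half of that scheme matches torch's built-in metric-driven scheduler; a sketch of the equivalent setup:

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Halve the LR once validation accuracy plateaus for 2 consecutive epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
# ...then, inside the epoch loop, after computing val_acc:
scheduler.step(val_acc)
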
Example #7
    swa_res = {'loss': None, 'accuracy': None}

utils.save_checkpoint(
    args.dir,
    start_epoch,
    state_dict=model.state_dict(),
    swa_state_dict=swa_model.state_dict() if args.swa else None,
    swa_n=swa_n if args.swa else None,
    optimizer=optimizer.state_dict())

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    lr = schedule(epoch)
    utils.adjust_learning_rate(optimizer, lr)
    train_res = utils.train_epoch(loaders['train'], model, criterion,
                                  optimizer)
    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion)
    else:
        test_res = {'loss': None, 'accuracy': None}

    if args.swa and (epoch + 1) >= args.swa_start and (
            epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
        utils.moving_average(swa_model, model, 1.0 / (swa_n + 1))
        swa_n += 1
        if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
            utils.bn_update(loaders['train'], swa_model)
            swa_res = utils.eval(loaders['test'], swa_model, criterion)
        else:
            swa_res = {'loss': None, 'accuracy': None}
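
utils.bn_update above recomputes the BatchNorm running statistics for the freshly averaged SWA weights; the statistics stored in swa_model otherwise belong to whatever weights last did a forward pass. A simplified sketch of such a helper (an assumption; torch >= 1.6 ships torch.optim.swa_utils.update_bn for the same purpose):

import torch

@torch.no_grad()
def bn_update(loader, model):
    # One pass over the training data with momentum=None accumulates a true
    # (cumulative) average of the batch statistics. Device handling omitted.
    model.train()
    for m in model.modules():
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
            m.momentum = None
    for x, _ in loader:
        model(x)
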
Example #8
def main(device, tr_loader, va_loader, te_loader, modelSelection):
    """Train CNN and show training plots."""
    # Model
    if modelSelection.lower() == 'res50':
        model = Res50()
    elif modelSelection.lower() == 'dense121':
        model = Dense121()
    elif modelSelection.lower() == 'dense161':
        model = Dense161()
    elif modelSelection.lower() == 'mobv2':
        model = Mob_v2()
    elif modelSelection.lower() == 'dense169':
        model = Dense169()
    elif modelSelection.lower() == 'mob':
        model = Net()
    elif modelSelection.lower() == 'squeeze':
        model = Squeeze()
    else:
        assert False, 'Wrong type of model selection string!'
    model = model.to(device)

    # TODO: define loss function, and optimizer
    learning_rate = utils.config(modelSelection + ".learning_rate")
    criterion = DepthLoss(0.1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    number_of_epochs = 10
    #

    # Attempt to restore the latest checkpoint, if one exists
    print("Loading unet...")
    model, start_epoch, stats = utils.restore_checkpoint(
        model, utils.config(modelSelection + ".checkpoint"))

    running_va_loss = [] if 'va_loss' not in stats else stats['va_loss']
    running_va_acc = [] if 'va_err' not in stats else stats['va_err']
    running_tr_loss = [] if 'tr_loss' not in stats else stats['tr_loss']
    running_tr_acc = [] if 'tr_err' not in stats else stats['tr_err']
    tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
    acc, loss = utils.evaluate_model(model, va_loader, device)
    running_va_acc.append(acc)
    running_va_loss.append(loss)
    running_tr_acc.append(tr_acc)
    running_tr_loss.append(tr_loss)
    stats = {
        'va_err': running_va_acc,
        'va_loss': running_va_loss,
        'tr_err': running_tr_acc,
        'tr_loss': running_tr_loss,
    }
    # Loop over the entire dataset multiple times
    # for epoch in range(start_epoch, config('cnn.num_epochs')):
    epoch = start_epoch
    # while curr_patience < patience:
    while epoch < number_of_epochs:
        # Train model
        utils.train_epoch(device, tr_loader, model, criterion, optimizer)
        # Save checkpoint
        utils.save_checkpoint(model, epoch + 1,
                              utils.config(modelSelection + ".checkpoint"),
                              stats)
        # Evaluate model
        tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
        va_acc, va_loss = utils.evaluate_model(model, va_loader, device)
        running_va_acc.append(va_acc)
        running_va_loss.append(va_loss)
        running_tr_acc.append(tr_acc)
        running_tr_loss.append(tr_loss)
        epoch += 1
    print("Finished Training")
    utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss,
                    running_va_acc)
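
The if/elif chain that maps modelSelection to a model class recurs in several of these examples; it collapses naturally into a lookup table. A sketch assuming the same model classes are in scope:

MODEL_REGISTRY = {
    'res50': Res50, 'dense121': Dense121, 'dense161': Dense161,
    'mobv2': Mob_v2, 'dense169': Dense169, 'mob': Net, 'squeeze': Squeeze,
}

def build_model(name):
    try:
        return MODEL_REGISTRY[name.lower()]()
    except KeyError:
        raise ValueError(f'Wrong type of model selection string: {name!r}')
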
Example #9
def main():
    args = parser()

    torch.backends.cudnn.benchmark = True
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    ######
    # prepare model and dataset
    ######

    print("Using model %s" % args.model)
    model_cfg = getattr(models, args.model)

    loaders, num_classes, num_data = generate_data(args, model_cfg)
    train_loader = loaders["train"]
    test_loader = loaders["test"]

    print("Preparing model")
    model = model_cfg.base(*model_cfg.args,
                           num_classes=num_classes,
                           **model_cfg.kwargs)
    model.cuda()

    ## please note that this code will only work if the checkpoints are saved as cuda tensors
    print("Loading Model")
    checkpoint = torch.load(args.resume)
    start_epoch = checkpoint["epoch"]
    model.load_state_dict(checkpoint["net"])

    #######
    # prepare linearized model by cloning parameters
    current_pars = finite_ntk.lazy.utils.flatten(model.parameters())
    # don't initialize exactly at the current parameters; add a small offset
    eps = 1e-6
    pars = torch.clone(current_pars.data) + eps
    pars = pars.detach_().view(-1, 1)
    pars.requires_grad = True

    if args.inference == "vi":
        sigma_pars = -5.0 * torch.ones_like(pars)

        pars = [pars, sigma_pars]
    else:
        pars = [pars]

    optimizer = torch.optim.Adam(pars, lr=args.lr_init, amsgrad=True)

    # set model in eval mode to freeze batch norm and dropout
    model.eval()

    loss_args = [
        model, num_classes, args.bias, args.wd, current_pars, num_data
    ]
    loss_instances = {
        "map": losses.map_crossentropy,
        "laplace": losses.laplace_crossentropy,
        "vi": losses.vi_crossentropy,
    }

    try:
        loss_func = loss_instances[args.inference]
    except KeyError:
        raise ValueError("Inference method not found")
    criterion = loss_func(*loss_args)
    eval_criterion = loss_func(*loss_args, eval=True)

    if args.epochs == 0:
        eval_dict = eval(loader=test_loader,
                         model=pars,
                         criterion=criterion,
                         verbose=True)
        print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"],
                                                  eval_dict["accuracy"]))

    for epoch in range(args.epochs):
        train_epoch(
            loader=train_loader,
            model=pars,
            criterion=criterion,
            optimizer=optimizer,
            verbose=True,
        )
        if epoch % args.eval_freq == 0:
            eval_dict = eval(loader=test_loader,
                             model=pars,
                             criterion=eval_criterion,
                             verbose=True)
            print("Eval loss: {} Eval acc: {}".format(eval_dict["loss"],
                                                      eval_dict["accuracy"]))

    if args.save_path is not None:
        print("Saving predictions to ", args.save_path)
        predictions_dict = predict(loader=test_loader,
                                   model=pars,
                                   criterion=eval_criterion,
                                   verbose=True)
        np.savez(
            args.save_path,
            weights=pars[0].detach().cpu().numpy(),
            predictions=predictions_dict["predictions"],
            targets=predictions_dict["targets"],
        )
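
finite_ntk.lazy.utils.flatten above packs every model parameter into one vector so the linearized model can be optimized over a single flat tensor. A plain-torch equivalent, assuming that is all the utility does:

import torch

def flatten(params):
    # Concatenate each parameter tensor, flattened, into one 1-D vector.
    return torch.cat([p.reshape(-1) for p in params])
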
Example #10
def main(device, tr_loader, va_loader, te_loader, modelSelection):
    """Train CNN and show training plots."""
    # CLI arguments
    # parser = arg.ArgumentParser(description='We all know what we are doing. Fighting!')
    # parser.add_argument("--datasize", "-d", default="small", type=str,
    #                     help="data size you want to use, small, medium, total")
    # Parsing
    # args = parser.parse_args()
    # Data loaders
    # datasize = args.datasize
    # Model
    if modelSelection.lower() == 'res50':
        model = Res50()
    elif modelSelection.lower() == 'dense121':
        model = Dense121()
    elif modelSelection.lower() == 'mobv2':
        model = Mob_v2()
    elif modelSelection.lower() == 'dense169':
        model = Dense169()
    elif modelSelection.lower() == 'mob':
        model = Net()
    elif modelSelection.lower() == 'squeeze':
        model = Squeeze()
    else:
        assert False, 'Wrong type of model selection string!'
    # Model
    # model = Net()
    # model = Squeeze()
    model = model.to(device)

    # TODO: define loss function, and optimizer
    learning_rate = utils.config(modelSelection + ".learning_rate")
    criterion = DepthLoss(0.1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    number_of_epochs = 10
    #

    # Attempt to restore the latest checkpoint, if one exists
    print("Loading unet...")
    model, start_epoch, stats = utils.restore_checkpoint(
        model, utils.config(modelSelection + ".checkpoint"))

    running_va_loss = [] if 'va_loss' not in stats else stats['va_loss']
    running_va_acc = [] if 'va_err' not in stats else stats['va_err']
    running_tr_loss = [] if 'tr_loss' not in stats else stats['tr_loss']
    running_tr_acc = [] if 'tr_err' not in stats else stats['tr_err']
    tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
    acc, loss = utils.evaluate_model(model, va_loader, device)
    running_va_acc.append(acc)
    running_va_loss.append(loss)
    running_tr_acc.append(tr_acc)
    running_tr_loss.append(tr_loss)
    stats = {
        'va_err': running_va_acc,
        'va_loss': running_va_loss,
        'tr_err': running_tr_acc,
        'tr_loss': running_tr_loss,
        # 'num_of_epoch': 0
    }
    # Loop over the entire dataset multiple times
    # for epoch in range(start_epoch, config('cnn.num_epochs')):
    epoch = start_epoch
    # while curr_patience < patience:
    while epoch < number_of_epochs:
        # Train model
        utils.train_epoch(device, tr_loader, model, criterion, optimizer)
        # Save checkpoint
        utils.save_checkpoint(model, epoch + 1,
                              utils.config(modelSelection + ".checkpoint"),
                              stats)
        # Evaluate model
        tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
        va_acc, va_loss = utils.evaluate_model(model, va_loader, device)
        running_va_acc.append(va_acc)
        running_va_loss.append(va_loss)
        running_tr_acc.append(tr_acc)
        running_tr_loss.append(tr_loss)
        epoch += 1
    print("Finished Training")
    utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss,
                    running_va_acc)
Example #11
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn,
                                        optimizer, device, scheduler,
                                        len(df_train))

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device,
                                   len(df_val))

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
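
get_linear_schedule_with_warmup comes from the transformers library: the learning rate ramps up linearly over num_warmup_steps, then decays linearly to zero at num_training_steps (with num_warmup_steps=0, as above, only the decay remains). An equivalent LambdaLR sketch:

from torch.optim.lr_scheduler import LambdaLR

def linear_schedule(optimizer, num_warmup_steps, num_training_steps):
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        return max(0.0, (num_training_steps - step)
                   / max(1, num_training_steps - num_warmup_steps))
    return LambdaLR(optimizer, lr_lambda)
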
Example #12
def main():

    parser = argparse.ArgumentParser(
        description="Run MorphNet Algorithm on Image Classification Model Zoo."
    )

    num_epochs_default = 1000
    num_classes_default = 10
    batch_size_default = 1024
    base_model_name_default = "ResNet50"
    learning_rate_default = 0.0001
    morphnet_regularizer_algorithm_default = "GroupLasso"
    morphnet_target_cost_default = "FLOPs"
    morphnet_hardware_default = "V100"
    morphnet_regularizer_threshold_default = 1e-2
    morphnet_regularization_multiplier_default = 1000.0
    log_dir_default = "./morphnet_log"
    main_train_device_default = "/cpu:0"
    main_eval_device_default = "/gpu:0"
    num_cuda_device_default = 4
    random_seed_default = 0
    base_model_choices = [
        "ResNet50", "ResNet101", "ResNet152", "ResNet50V2", "ResNet101V2",
        "ResNet152V2", "VGG16", "VGG19", "Xception",
        "InceptionV3", "InceptionResNetV2", "MobileNet", "MobileNetV2",
        "DenseNet121", "DenseNet169", "DenseNet201", "NASNetLarge",
        "NASNetMobile"
    ]
    morphnet_regularizer_algorithm_choices = ["GroupLasso", "Gamma"]
    morphnet_target_cost_choices = ["FLOPs", "Latency", "ModelSize"]
    morphnet_hardware_choices = ["V100", "P100", "Others"]

    parser.add_argument("--num-epochs",
                        type=int,
                        help="The number of epochs for training.",
                        default=num_epochs_default)
    parser.add_argument("--num-classes",
                        type=int,
                        help="The number of classes for image classification.",
                        default=num_classes_default)
    parser.add_argument("--batch-size",
                        type=int,
                        help="Batch size.",
                        default=batch_size_default)
    parser.add_argument("--learning-rate",
                        type=float,
                        help="Learning rate.",
                        default=learning_rate_default)
    parser.add_argument("--base-model-name",
                        type=str,
                        choices=base_model_choices,
                        help="Select base model for image classification.",
                        default=base_model_name_default)
    parser.add_argument("--morphnet-regularizer-algorithm",
                        type=str,
                        choices=morphnet_regularizer_algorithm_choices,
                        help="Select MorphNet regularization algorithm.",
                        default=morphnet_regularizer_algorithm_default)
    parser.add_argument("--morphnet-target-cost",
                        type=str,
                        choices=morphnet_target_cost_choices,
                        help="Select MorphNet target cost.",
                        default=morphnet_target_cost_default)
    parser.add_argument("--morphnet-hardware",
                        type=str,
                        choices=morphnet_hardware_choices,
                        help="Select MorphNet hardware.",
                        default=morphnet_hardware_default)
    parser.add_argument(
        "--morphnet-regularizer-threshold",
        type=float,
        help="Set the threshold [0, 1] for killing neuron layers.",
        default=morphnet_regularizer_threshold_default)
    parser.add_argument(
        "--morphnet-regularization-multiplier",
        type=float,
        help="Set the MorphNet regularization multiplier that controls the "
        "regularization strength. The strength used for training equals this "
        "multiplier divided by the initial cost of the model; setting it to "
        "zero turns off MorphNet regularization.",
        default=morphnet_regularization_multiplier_default)
    parser.add_argument(
        "--log-dir",
        type=str,
        help="Log directory for TensorBoard and optimized model architectures.",
        default=log_dir_default)
    parser.add_argument("--num-cuda-device",
                        type=int,
                        help="Number of CUDA device to use.",
                        default=num_cuda_device_default)
    parser.add_argument("--random-seed",
                        type=int,
                        help="Random seed.",
                        default=random_seed_default)
    parser.add_argument(
        "--main-train-device",
        type=str,
        help="The device where the model parameters were located.",
        default=main_train_device_default)
    parser.add_argument("--main-eval-device",
                        type=str,
                        help="The device used for model evaluation",
                        default=main_eval_device_default)

    argv = parser.parse_args()

    num_epochs = argv.num_epochs
    num_classes = argv.num_classes
    batch_size = argv.batch_size
    base_model_name = argv.base_model_name
    learning_rate = argv.learning_rate
    morphnet_regularizer_algorithm = argv.morphnet_regularizer_algorithm
    morphnet_target_cost = argv.morphnet_target_cost
    morphnet_hardware = argv.morphnet_hardware
    morphnet_regularizer_threshold = argv.morphnet_regularizer_threshold
    morphnet_regularization_multiplier = argv.morphnet_regularization_multiplier
    log_dir = argv.log_dir
    num_cuda_device = argv.num_cuda_device
    random_seed = argv.random_seed
    main_train_device = argv.main_train_device
    main_eval_device = argv.main_eval_device

    set_reproducible_environment(random_seed=random_seed)

    (x_train, y_train), (x_valid,
                         y_valid) = tf.keras.datasets.cifar10.load_data()
    # Convert class vectors to binary class matrices.
    y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes)
    y_valid_onehot = tf.keras.utils.to_categorical(y_valid, num_classes)
    image_shape = x_train.shape[1:]
    # Normalize image inputs
    x_train = x_train.astype("float32") / 255.0
    x_valid = x_valid.astype("float32") / 255.0

    base_model = select_keras_base_model(base_model_name=base_model_name)
    morphnet_regularization_strength_dummy = 1e-9
    model = MorphNetModel(
        base_model=base_model,
        num_classes=num_classes,
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_gpus=num_cuda_device,
        main_train_device=main_train_device,
        main_eval_device=main_eval_device,
        morphnet_regularizer_algorithm=morphnet_regularizer_algorithm,
        morphnet_target_cost=morphnet_target_cost,
        morphnet_hardware=morphnet_hardware,
        morphnet_regularizer_threshold=morphnet_regularizer_threshold,
        morphnet_regularization_strength=morphnet_regularization_strength_dummy,
        log_dir=log_dir)

    # Export the unmodified model configures.
    initial_cost = model.get_model_cost(inputs=x_train[:batch_size])
    print("*" * 100)
    print("Initial Model Cost: {:.1f}".format(initial_cost))
    morphnet_regularization_strength = 1.0 / initial_cost * morphnet_regularization_multiplier
    print("Use Regularization Strength: {}".format(
        morphnet_regularization_strength))
    model.set_morphnet_regularization_strength(
        morphnet_regularization_strength=morphnet_regularization_strength)
    print("*" * 100)
    # Export the unmodified model configures.
    model.export_model_config_with_inputs(inputs=x_train[:batch_size])

    for epoch in range(num_epochs):
        validate_epoch(epoch=epoch,
                       model=model,
                       x_valid=x_valid,
                       y_valid_onehot=y_valid_onehot,
                       batch_size=batch_size)
        train_epoch(epoch=epoch,
                    model=model,
                    x_train=x_train,
                    y_train_onehot=y_train_onehot,
                    batch_size=batch_size,
                    shuffle=True,
                    print_batch_info=False)
        # Export the model configure routinely.
        model.export_model_config_with_inputs(inputs=x_train[:batch_size])

    validate_epoch(epoch=num_epochs,
                   model=model,
                   x_valid=x_valid,
                   y_valid_onehot=y_valid_onehot,
                   batch_size=batch_size)

    model.close()

    return 0
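
set_reproducible_environment is called above but not shown. A typical implementation, assumed here, seeds Python, NumPy, and TensorFlow in one place:

import random
import numpy as np
import tensorflow as tf

def set_reproducible_environment(random_seed=0):
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.compat.v1.set_random_seed(random_seed)
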
Example #13
def main(args):
    savedir = "./saved-outputs/"
    
    ## randomly initialize simplexes to determine regularization parameters ##
    reg_pars = []
    for ii in range(args.n_verts+1):
        fix_pts = [True]*(ii + 1)
        start_vert = len(fix_pts)

        out_dim = 100
        simplex_model = SimplexNet(out_dim, VGG16Simplex, n_vert=start_vert,
                               fix_points=fix_pts)
        simplex_model = simplex_model.cuda()
        
        log_vol = (simplex_model.total_volume() + 1e-4).log()
        
        reg_pars.append(max(float(args.LMBD)/log_vol, 1e-8))
    
    
    ## import training and testing data ##
    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    dataset = torchvision.datasets.CIFAR100(args.data_path, 
                                           train=True, download=False,
                                           transform=transform_train)
    trainloader = DataLoader(dataset, shuffle=True, batch_size=args.batch_size,
                                num_workers=4, pin_memory=True)
    
    testset = torchvision.datasets.CIFAR100(args.data_path, 
                                           train=False, download=False,
                                           transform=transform_test)
    testloader = DataLoader(testset, shuffle=True, batch_size=args.batch_size,
                                num_workers=4, pin_memory=True)
    
    columns = ['component', 'vert', 'ep', 'tr_loss', 
           'tr_acc', 'te_loss', 'te_acc', 'time', "vol"]
    for component in range(args.n_component):
        ## load in pre-trained model ##
        fix_pts = [False]
        simplex_model = SimplexNet(100, VGG16Simplex, n_vert=1,
                               fix_points=fix_pts).cuda()


        ## add a new point and train ##
        for vv in range(args.n_verts):
            if vv == 0:
                optimizer = torch.optim.SGD(
                    simplex_model.parameters(),
                    lr=args.base_lr,
                    momentum=0.9,
                    weight_decay=args.wd
                )
            else:
                optimizer = torch.optim.SGD(
                    simplex_model.parameters(),
                    lr=args.simplex_lr,
                    momentum=0.9,
                    weight_decay=args.wd
                )

            criterion = torch.nn.CrossEntropyLoss()
            n_epoch = args.base_epochs if vv==0 else args.simplex_epochs
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 
                                                                   T_max=n_epoch)

            for epoch in range(n_epoch):
                time_ep = time.time()
                if vv == 0:
                    train_res = utils.train_epoch(trainloader, simplex_model, 
                                                  criterion, optimizer)
                else:
                    train_res = utils.train_epoch_volume(trainloader, simplex_model, 
                                                         criterion, optimizer, 
                                                         reg_pars[vv], args.n_sample)

                start_ep = (epoch == 0)
                eval_ep = epoch % args.eval_freq == args.eval_freq - 1
                end_ep = epoch == n_epoch - 1
                if start_ep or eval_ep or end_ep:
                    test_res = utils.eval(testloader, simplex_model, criterion)
                else:
                    test_res = {'loss': None, 'accuracy': None}

                time_ep = time.time() - time_ep

                lr = optimizer.param_groups[0]['lr']
                scheduler.step()

                values = [component, vv, epoch + 1, 
                          train_res['loss'], train_res['accuracy'], 
                          test_res['loss'], test_res['accuracy'], time_ep,
                         simplex_model.total_volume().item()]

                table = tabulate.tabulate([values], columns, 
                                          tablefmt='simple', floatfmt='8.4f')
                if epoch % 40 == 0:
                    table = table.split('\n')
                    table = '\n'.join([table[1]] + table)
                else:
                    table = table.split('\n')[2]
                print(table, flush=True)

            checkpoint = simplex_model.state_dict()
            fname = "base_" + str(component) + "_simplex_" + str(vv) + ".pt"
            torch.save(checkpoint, savedir + fname) 

            simplex_model.add_vert()
            simplex_model = simplex_model.cuda()
Example #14
def run_evaluation(model,
                   ensemble_model,
                   data_loaders,
                   args,
                   save_model='',
                   load_model=''):

    all_values = {}
    device = 'cuda'

    utils.setup_torch(args['seed'])

    inputs = torch.randn(
        (1, args['input_channels'], args['img_size'], args['img_size']))
    total_ops, total_params = profile(model, (inputs, ), verbose=True)
    all_values['MMACs'] = np.round(total_ops / (1000.0**2), 2)
    all_values['Params'] = int(total_params)
    print(all_values)

    start = time.time()
    model = model.to(device)
    ensemble_model = ensemble_model.to(device)
    print('models to device', time.time() - start)

    if len(load_model) > 0:
        model.load_state_dict(torch.load(os.path.join(args['dir'],
                                                      load_model)))

    criterion = torch.nn.CrossEntropyLoss()

    ################################################

    summary(model, (3, 32, 32), batch_size=args['batch_size'], device='cuda')

    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args['lr_init'],
                                momentum=0.9,
                                weight_decay=1e-4)

    lrs = []
    n_models = 0

    all_values['epoch'] = []
    all_values['overall_time'] = []
    all_values['lr'] = []

    all_values['tr_loss'] = []
    all_values['tr_acc'] = []

    all_values['val_loss_single'] = []
    all_values['val_acc_single'] = []
    all_values['val_loss_ensemble'] = []
    all_values['val_acc_ensemble'] = []

    all_values['test_loss_single'] = []
    all_values['test_acc_single'] = []
    all_values['test_loss_ensemble'] = []
    all_values['test_acc_ensemble'] = []

    n_models = 0
    time_start = time.time()

    for epoch in range(args['epochs']):
        time_ep = time.time()

        lr = utils.get_cyclic_lr(epoch, lrs, args['lr_init'],
                                 args['lr_start_cycle'], args['cycle_period'])
        #print ('lr=%.3f' % lr)
        utils.set_learning_rate(optimizer, lr)
        lrs.append(lr)

        train_res = utils.train_epoch(device, data_loaders['train'], model,
                                      criterion, optimizer,
                                      args['num_samples_train'])

        values = [epoch + 1, lr, train_res['loss'], train_res['accuracy']]

        if (epoch + 1) >= args['lr_start_cycle'] and (
                epoch + 1) % args['cycle_period'] == 0:

            all_values['epoch'].append(epoch + 1)
            all_values['lr'].append(lr)

            all_values['tr_loss'].append(train_res['loss'])
            all_values['tr_acc'].append(train_res['accuracy'])

            val_res = utils.evaluate(device, data_loaders['val'], model,
                                     criterion, args['num_samples_val'])
            test_res = utils.evaluate(device, data_loaders['test'], model,
                                      criterion, args['num_samples_test'])

            all_values['val_loss_single'].append(val_res['loss'])
            all_values['val_acc_single'].append(val_res['accuracy'])
            all_values['test_loss_single'].append(test_res['loss'])
            all_values['test_acc_single'].append(test_res['accuracy'])

            utils.moving_average_ensemble(ensemble_model, model,
                                          1.0 / (n_models + 1))
            utils.bn_update(device, data_loaders['train_for_bn_recalc'],
                            ensemble_model)
            n_models += 1

            val_res = utils.evaluate(device, data_loaders['val'],
                                     ensemble_model, criterion,
                                     args['num_samples_val'])
            test_res = utils.evaluate(device, data_loaders['test'],
                                      ensemble_model, criterion,
                                      args['num_samples_test'])

            all_values['val_loss_ensemble'].append(val_res['loss'])
            all_values['val_acc_ensemble'].append(val_res['accuracy'])
            all_values['test_loss_ensemble'].append(test_res['loss'])
            all_values['test_acc_ensemble'].append(test_res['accuracy'])

            overall_training_time = time.time() - time_start
            all_values['overall_time'].append(overall_training_time)

        #print (epoch, 'epoch_time', time.time() - time_ep)

        overall_training_time = time.time() - time_start
        #print ('overall time', overall_training_time)

        #print (all_values)

    if len(save_model) > 0:
        torch.save(ensemble_model.state_dict(),
                   os.path.join(args['dir'], save_model + '_ensemble'))
        torch.save(model.state_dict(), os.path.join(args['dir'], save_model))

    return all_values
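
utils.set_learning_rate above (like the adjust_learning_rate helpers in the other examples) updates the optimizer in place; the conventional body is a loop over the parameter groups:

def set_learning_rate(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr
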
Example #15
def main(args):

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")

    train_loader, test_loader = load_dataset(args.label, args.batch_size,
                                             args.half_length, args.nholes)

    if args.label == 10:
        model = ShakeResNet(args.depth, args.w_base, args.label)
    else:
        model = ShakeResNeXt(args.depth, args.w_base, args.cardinary,
                             args.label)

    model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    if args.optimizer == 'sgd':
        print("using sgd")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        nesterov=args.nesterov)

    elif args.optimizer == 'abd':
        print("using adabound")
        opt = abd.AdaBound(model.parameters(),
                           lr=args.lr,
                           gamma=args.gamma,
                           weight_decay=args.weight_decay,
                           final_lr=args.final_lr)

    elif args.optimizer == 'swa':
        print("using swa")
        opt = optim.SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
        steps_per_epoch = len(train_loader.dataset) / args.batch_size
        steps_per_epoch = int(steps_per_epoch)
        opt = swa(opt,
                  swa_start=args.swa_start * steps_per_epoch,
                  swa_freq=steps_per_epoch,
                  swa_lr=args.swa_lr)
    else:
        print("not a valid optimizer")
        exit()

    loss_func = nn.CrossEntropyLoss().cuda()

    headers = [
        "Epoch", "LearningRate", "TrainLoss", "TestLoss", "TrainAcc.",
        "TestAcc."
    ]

    #if args.optimizer=='swa':
    #   headers = headers[:-1] + ['swa_te_loss', 'swa_te_acc'] + headers[-1:]
    #  swa_res = {'loss': None, 'accuracy': None}

    logger = utils.Logger(args.checkpoint, headers, mod=args.optimizer)

    for e in range(args.epochs):

        if args.optimizer == 'swa':
            lr = utils.schedule(e, args.optimizer, args.epochs, args.swa_start,
                                args.swa_lr, args.lr)
            utils.adjust_learning_rate(opt, lr)
        elif args.optimizer == 'sgd':
            lr = utils.cosine_lr(opt, args.lr, e, args.epochs)
        else:
            exit()

        #train
        train_loss, train_acc, train_n = utils.train_epoch(
            train_loader, model, opt)
        #eval
        test_loss, test_acc, test_n = utils.eval_epoch(test_loader, model)

        logger.write(e + 1, lr, train_loss / train_n, test_loss / test_n,
                     train_acc / train_n * 100, test_acc / test_n * 100)

        if args.optimizer == 'swa' and (
                e + 1) >= args.swa_start and args.eval_freq > 1:
            if e == 0 or e % args.eval_freq == args.eval_freq - 1 or e == args.epochs - 1:
                opt.swap_swa_sgd()
                opt.bn_update(train_loader, model, device='cuda')
                #swa_res = utils.eval_epoch(test_loaders['test'], model)
                opt.swap_swa_sgd()
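
The swa(...) wrapper used above matches the torchcontrib.optim.SWA interface: it wraps a base optimizer, maintains the weight average in the background, and swap_swa_sgd() exchanges the model weights with the averaged ones. A minimal usage sketch (model, loss_fn, x, and y stand in for this example's own objects):

import torch
from torchcontrib.optim import SWA

base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
opt = SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
for _ in range(100):
    opt.zero_grad()
    loss_fn(model(x), y).backward()
    opt.step()
opt.swap_swa_sgd()  # copy the averaged weights into the model for evaluation
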
Example #16
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']

utils.save_checkpoint(args.dir,
                      start_epoch,
                      state_dict=model.state_dict(),
                      optimizer=optimizer.state_dict())

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()

    train_res = utils.train_epoch(loaders['train'],
                                  model,
                                  criterion,
                                  optimizer,
                                  aug_reg=args.aug_reg)
    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion)
    else:
        test_res = {'loss': None, 'accuracy': None}

    lr = optimizer.param_groups[0]['lr']
    print("Brightness", model.aug[0].lims)
    print("Contrast", model.aug[1].lims)
    scheduler.step()

    if (epoch + 1) % args.save_freq == 0:
        utils.save_checkpoint(args.dir,
                              epoch + 1,
Example #17
def train_oneshot_model(args,
                        data_loaders,
                        n_cells,
                        n_choices,
                        put_downsampling=[]):

    num_samples = utils.get_number_of_samples(args.dataset)

    device = 'cuda'

    utils.setup_torch(args.seed)

    print('Initializing model...')

    #Create a supernet skeleton (include all cell types for each position)
    propagate_weights = [[1, 1, 1] for i in range(n_cells)]
    model_class = getattr(models, 'Supernet')

    #Create the supernet model and its SWA ensemble version
    model = model_class(num_classes=utils.get_number_of_classes(args.dataset),
                        propagate=propagate_weights,
                        training=True,
                        n_choices=n_choices,
                        put_downsampling=put_downsampling).to(device)
    ensemble_model = model_class(num_classes=utils.get_number_of_classes(
        args.dataset),
                                 propagate=propagate_weights,
                                 training=True,
                                 n_choices=n_choices,
                                 put_downsampling=put_downsampling).to(device)

    #These summaries are for verification purposes only.
    #Removing them, however, changes the results: the random generators used to
    #propagate through the supernet are advanced inside the summary calls.
    summary(model, (3, 32, 32), batch_size=args.batch_size, device='cuda')
    summary(ensemble_model, (3, 32, 32),
            batch_size=args.batch_size,
            device='cuda')

    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr_init,
                                momentum=0.9,
                                weight_decay=1e-4)

    start_epoch = 0

    columns = [
        'epoch time', 'overall training time', 'epoch', 'lr', 'train_loss',
        'train_acc', 'val_loss', 'val_acc', 'test_loss', 'test_acc'
    ]

    lrs = []
    n_models = 0

    all_values = {}
    all_values['epoch'] = []
    all_values['lr'] = []

    all_values['tr_loss'] = []
    all_values['tr_acc'] = []

    all_values['val_loss'] = []
    all_values['val_acc'] = []
    all_values['test_loss'] = []
    all_values['test_acc'] = []

    n_models = 0
    print('Start training...')

    time_start = time.time()
    for epoch in range(start_epoch, args.epochs):
        time_ep = time.time()

        #lr = utils.get_cosine_annealing_lr(epoch, args.lr_init, args.epochs)
        lr = utils.get_cyclic_lr(epoch, lrs, args.lr_init, args.lr_start_cycle,
                                 args.cycle_period)
        utils.set_learning_rate(optimizer, lr)
        lrs.append(lr)

        train_res = utils.train_epoch(device, data_loaders['train'], model,
                                      criterion, optimizer,
                                      num_samples['train'])

        values = [epoch + 1, lr, train_res['loss'], train_res['accuracy']]

        if (epoch + 1) >= args.lr_start_cycle and (epoch +
                                                   1) % args.cycle_period == 0:

            all_values['epoch'].append(epoch + 1)
            all_values['lr'].append(lr)

            all_values['tr_loss'].append(train_res['loss'])
            all_values['tr_acc'].append(train_res['accuracy'])

            val_res = utils.evaluate(device, data_loaders['val'], model,
                                     criterion, num_samples['val'])
            test_res = utils.evaluate(device, data_loaders['test'], model,
                                      criterion, num_samples['test'])

            all_values['val_loss'].append(val_res['loss'])
            all_values['val_acc'].append(val_res['accuracy'])
            all_values['test_loss'].append(test_res['loss'])
            all_values['test_acc'].append(test_res['accuracy'])
            values += [
                val_res['loss'], val_res['accuracy'], test_res['loss'],
                test_res['accuracy']
            ]

            utils.moving_average_ensemble(ensemble_model, model,
                                          1.0 / (n_models + 1))
            utils.bn_update(device, data_loaders['train'], ensemble_model)
            n_models += 1

            print(all_values)

        overall_training_time = time.time() - time_start
        values = [time.time() - time_ep, overall_training_time] + values
        table = tabulate.tabulate([values],
                                  columns,
                                  tablefmt='simple',
                                  floatfmt='8.4f')
        print(table)

    print('Training finished. Saving final nets...')
    utils.save_result(all_values, args.dir, 'model_supernet')

    torch.save(model.state_dict(), args.dir + '/supernet.pth')
    torch.save(ensemble_model.state_dict(), args.dir + '/supernet_swa.pth')
Example #18
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss = WeightAdjustingLoss().to(device)

for epoch in range(EPOCHS):
    start_time = time.time()
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss,
        optimizer,
        device,
        scheduler,
        6217
    )
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss,
        device,
        777
    )
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch::{epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'Train Loss {train_loss} accuracy {train_acc}')
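
epoch_time is not defined in this snippet; the helper usually paired with this print format, assumed here, splits elapsed seconds into minutes and seconds:

def epoch_time(start_time, end_time):
    elapsed = end_time - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed - mins * 60)
    return mins, secs
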
Example #19
File: main.py Project: MIPT-Oulu/JS2
                          momentum=0.9,
                          nesterov=True)

    best_acc = 0.0
    best_epoch = None

    fpr = []
    tpr = []

    for epoch in range(MAX_EPOCH):

        optimizer, lr = utils.adjust_learning_rate(optimizer, epoch, 0.00001,
                                                   0.01, 8)

        # Training
        train_loss = utils.train_epoch(epoch, net, optimizer, train_dataloader,
                                       criterion)
        # Validating
        val_loss, preds, truth = utils.validate_epoch(net, test_dataloader,
                                                      criterion)

        auc_val = roc_auc_score(truth, preds)
        print(epoch + 1, train_loss, val_loss, auc_val)

        if auc_val > best_acc:
            best_acc = auc_val
            best_epoch = epoch
            fpr, tpr, thresholds = roc_curve(truth, preds)

    plt.plot(fpr, tpr, lw=2, alpha=0.8, label='ROC (AUC = %0.2f)' % (best_acc))
    plt.show()
Example #20
        if swa_n_ckpt is not None:
            swa_n = swa_n_ckpt

criterion = F.cross_entropy

columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']
if args.swa:
    columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:]
    swa_res = {'loss': None, 'accuracy': None}

for epoch in range(start_epoch, args.ine_start):
    time_ep = time.time()

    lr = schedule_swa(epoch)
    utils.adjust_learning_rate(optimizer, lr)
    train_res = utils.train_epoch(loaders['train'], model, criterion,
                                  optimizer, 'swa', args)
    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion, 'swa')
    else:
        test_res = {'loss': None, 'accuracy': None}

    if args.swa and (epoch + 1) >= args.swa_start and (
            epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
        utils.moving_average(swa_model, model, 1.0 / (swa_n + 1))
        swa_n += 1
        if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
            utils.bn_update(loaders['train'], swa_model)
            swa_res = utils.eval(loaders['test'], swa_model, criterion, 'swa')
        else:
            swa_res = {'loss': None, 'accuracy': None}
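
schedule_swa above is not shown. The schedule from the original SWA implementation, assumed here, holds lr_init for the first half of training, decays linearly, then flattens out at swa_lr:

def schedule_swa(epoch):
    t = epoch / (args.swa_start if args.swa else args.epochs)
    lr_ratio = args.swa_lr / args.lr_init if args.swa else 0.01
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    return args.lr_init * factor
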
Example #21
    state_dict=model.state_dict(),
    optimizer=optimizer.state_dict()
)

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()
    #memory_prior = torch.cuda.memory_allocated()

    if not args.no_schedule:
        lr = schedule(epoch)
        utils.adjust_learning_rate(optimizer, lr)
    else:
        lr = args.lr_init
    
    if (args.swa and (epoch + 1) > args.swa_start) and args.cov_mat:
        model_batch_means, train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer, batch_means=True)
    else:
        model_batch_means = None
        train_res = utils.train_epoch(loaders['train'], model, criterion, optimizer)

    if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
        test_res = utils.eval(loaders['test'], model, criterion)
    else:
        test_res = {'loss': None, 'accuracy': None}

    if args.swa and (epoch + 1) > args.swa_start and (epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
        swag_model.collect_model(model, bm=model_batch_means)
        del model_batch_means
        if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
            swag_model.sample(0.0)
            utils.bn_update(loaders['train'], swag_model)
Example #22
def main(args):
    os.makedirs("./saved-outputs/", exist_ok=True)

    transform_train = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    dataset = torchvision.datasets.CIFAR100(args.data_path,
                                            train=True,
                                            download=False,
                                            transform=transform_train)
    trainloader = DataLoader(dataset, shuffle=True, batch_size=args.batch_size)
    testset = torchvision.datasets.CIFAR100(args.data_path,
                                            train=False,
                                            download=False,
                                            transform=transform_test)
    testloader = DataLoader(testset, shuffle=True, batch_size=args.batch_size)

    model = VGG16(100)
    model = model.cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr_init,
                                momentum=0.9,
                                weight_decay=args.wd)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           T_max=args.epochs)
    criterion = torch.nn.CrossEntropyLoss()
    columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']
    for epoch in range(args.epochs):
        time_ep = time.time()
        train_res = utils.train_epoch(trainloader, model, criterion, optimizer)

        if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
            test_res = utils.eval(testloader, model, criterion)
        else:
            test_res = {'loss': None, 'accuracy': None}

        time_ep = time.time() - time_ep

        lr = optimizer.param_groups[0]['lr']
        scheduler.step()

        values = [
            epoch + 1, lr, train_res['loss'], train_res['accuracy'],
            test_res['loss'], test_res['accuracy'], time_ep
        ]

        table = tabulate.tabulate([values],
                                  columns,
                                  tablefmt='simple',
                                  floatfmt='8.4f')
        if epoch % 40 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table, flush=True)

    checkpoint = model.state_dict()
    trial_num = len(glob.glob("./saved-outputs/model_*"))
    savedir = "./saved-outputs/model_" +\
               str(trial_num) + "/"
    os.makedirs(savedir, exist_ok=True)
    torch.save(checkpoint, savedir + "base_model.pt")
Example #23
                                            step_size=1,
                                            gamma=lr_decay)
loss_fun = torch.nn.MSELoss()

min_mse = 100
train_mse = []
valid_mse = []
test_mse = []

for i in range(n_epochs):
    start = time.time()
    scheduler.step()  # note: PyTorch >= 1.1 expects this after optimizer.step(); kept here to preserve the original per-epoch decay

    model.train()
    # use train_epoch_scale/eval_epoch_scale for training scale equivariant models
    train_mse.append(train_epoch(train_loader, model, optimizer, loss_fun))
    model.eval()
    mse, _, _ = eval_epoch(valid_loader, model, loss_fun)
    valid_mse.append(mse)

    if valid_mse[-1] < min_mse:
        min_mse = valid_mse[-1]
        best_model = copy.deepcopy(model)  # snapshot; plain assignment would alias the live model (assumes `import copy`)
        torch.save(best_model, save_name + ".pth")
    end = time.time()

    # Early Stopping but train at least for 50 epochs
    if (len(train_mse) > 50
            and np.mean(valid_mse[-5:]) >= np.mean(valid_mse[-10:-5])):
        break
    print(i + 1, train_mse[-1], valid_mse[-1], round((end - start) / 60, 5))
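
The stopping rule above compares the mean of the last five validation errors against the five before them; pulled out into a standalone helper (a sketch, the name should_stop is mine), it reads:

import numpy as np

def should_stop(valid_mse, min_epochs=50, window=5):
    # Train for at least min_epochs, then stop once the most recent window
    # of validation MSEs no longer improves on the window before it.
    if len(valid_mse) <= min_epochs:
        return False
    return np.mean(valid_mse[-window:]) >= np.mean(valid_mse[-2 * window:-window])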
cost = stat['embedding'] + 0.  # "+ 0." forces a fresh copy of the cost matrix
# stat['cp'][0] = ot.emd(ps, pt, stat['embedding']) + 0.
stat['cp'][0] = ot.optim.cg(ps, pt, cost, reg, f, df, verbose=True)

# display the initial-guess coupling
ot.plot.plot1D_mat(ps, pt, stat['cp'][0], 'OT matrix Entrop. reg')

# In[9]:
# pre-train the model on the source task and save it
network = CNN().to(stat['dev'])
optimizer = optim.SGD(network.parameters(),
                      lr=1e-3,
                      momentum=0.9,
                      weight_decay=stat['weight_decay'])
for epoch in range(30):
    train_epoch(network, stat, optimizer)
    if (epoch + 1) % 5 == 0:
        test(stat, network)
# save the pretrained model
torch.save(network.state_dict(),
           os.path.join(MNIST_tran_ini, 'CNN={}.pth'.format('animal')))

# In[10]:
# load the pre-trained model
network = CNN().to(stat['dev'])
network.load_state_dict(
    torch.load(os.path.join(MNIST_tran_ini, 'CNN={}.pth'.format('animal'))))

optimizer = optim.SGD(network.parameters(),
                      lr=1e-3,
                      momentum=0.9,
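
For contrast with the conditional-gradient solver used above, an entropically regularized plan on a synthetic 1-D problem can be computed directly with ot.sinkhorn (the histograms and cost below are stand-ins, not the snippet's data):

import numpy as np
import ot

n = 50
x = np.arange(n, dtype=np.float64).reshape(-1, 1)
a = ot.datasets.make_1D_gauss(n, m=15, s=5)      # source histogram
b = ot.datasets.make_1D_gauss(n, m=35, s=8)      # target histogram
M = ot.dist(x, x)                                # squared-Euclidean cost
M /= M.max()
G = ot.sinkhorn(a, b, M, reg=1e-2)               # entropic OT coupling
ot.plot.plot1D_mat(a, b, G, 'OT matrix, Sinkhorn')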
Example #25
0
start_epoch = 0

# Prepare logging
columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']

for epoch in range(start_epoch, args.epochs):
    time_ep = time.time()
    lr = schedule(epoch)
    # Rebuild the gradient quantizer each epoch so it bakes in the current lr
    grad_quantizer = lambda x: wage_qtorch.QG(
        x, args.wl_grad, args.wl_rand, lr, mode=args.grad_rounding)

    train_res = utils.train_epoch(loaders['train'],
                                  model,
                                  criterion,
                                  weight_quantizer,
                                  grad_quantizer,
                                  epoch,
                                  wage_quantize=True,
                                  wage_grad_clip=grad_clip)

    # Validation
    test_res = utils.eval(loaders['test'], model, criterion, weight_quantizer)

    time_ep = time.time() - time_ep
    values = [
        epoch + 1, lr, train_res['loss'], train_res['accuracy'],
        test_res['loss'], test_res['accuracy'], time_ep
    ]

    table = tabulate.tabulate([values],
                              columns,
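
wage_qtorch.QG above applies WAGE's gradient quantization; as a rough, library-free illustration of the core ingredient (a fixed-point grid with stochastic rounding; this is a sketch of the idea, not WAGE itself):

import torch

def stochastic_fixed_point(x, word_length=8):
    # Snap x onto a uniform grid of step 2 ** -(word_length - 1), rounding
    # up or down at random in proportion to the remainder (unbiased rounding).
    scale = 2.0 ** (word_length - 1)
    q = torch.floor(x * scale + torch.rand_like(x)) / scale
    return q.clamp_(-1.0, 1.0 - 1.0 / scale)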
def main(device=torch.device('cuda:0')):
    # CLI arguments
    parser = arg.ArgumentParser(
        description='We all know what we are doing. Fighting!')
    parser.add_argument("--datasize",
                        "-d",
                        default="small",
                        type=str,
                        help="data size you want to use, small, medium, total")
    # Parsing
    args = parser.parse_args()
    # Data loaders
    datasize = args.datasize
    pathname = "data/nyu.zip"
    tr_loader, va_loader, te_loader = getTrainingValidationTestingData(
        datasize, pathname, batch_size=config("unet.batch_size"))

    # Model
    model = Net()

    # TODO: define loss function, and optimizer
    learning_rate = utils.config("unet.learning_rate")
    criterion = DepthLoss(0.1)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    number_of_epochs = 10
    #

    # print("Number of float-valued parameters:", util.count_parameters(model))

    # Attempts to restore the latest checkpoint if exists
    print("Loading unet...")
    model, start_epoch, stats = utils.restore_checkpoint(
        model, utils.config("unet.checkpoint"))

    # axes = utils.make_training_plot()

    # Evaluate the randomly initialized model
    # evaluate_epoch(
    #     axes, tr_loader, va_loader, te_loader, model, criterion, start_epoch, stats
    # )
    # loss = criterion()

    # initial val loss for early stopping
    # prev_val_loss = stats[0][1]

    running_va_loss = []
    running_va_acc = []
    running_tr_loss = []
    running_tr_acc = []
    # TODO: define patience for early stopping
    # patience = 1
    # curr_patience = 0
    #
    tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
    acc, loss = utils.evaluate_model(model, va_loader, device)
    running_va_acc.append(acc)
    running_va_loss.append(loss)
    running_tr_acc.append(tr_acc)
    running_tr_loss.append(tr_loss)

    # Loop over the entire dataset multiple times
    # for epoch in range(start_epoch, config('cnn.num_epochs')):
    epoch = start_epoch
    # while curr_patience < patience:
    while epoch < number_of_epochs:
        # Train model
        utils.train_epoch(tr_loader, model, criterion, optimizer)
        tr_acc, tr_loss = utils.evaluate_model(model, tr_loader, device)
        va_acc, va_loss = utils.evaluate_model(model, va_loader, device)
        running_va_acc.append(va_acc)
        running_va_loss.append(va_loss)
        running_tr_acc.append(tr_acc)
        running_tr_loss.append(tr_loss)
        # Evaluate model
        # evaluate_epoch(
        #     axes, tr_loader, va_loader, te_loader, model, criterion, epoch + 1, stats
        # )

        # Save model parameters
        utils.save_checkpoint(model, epoch + 1,
                              utils.config("unet.checkpoint"), stats)

        # update early stopping parameters
        """
        curr_patience, prev_val_loss = early_stopping(
            stats, curr_patience, prev_val_loss
        )
        """

        epoch += 1
    print("Finished Training")
    # Save figure and keep plot open
    # utils.save_training_plot()
    # utils.hold_training_plot()
    utils.make_plot(running_tr_loss, running_tr_acc, running_va_loss,
                    running_va_acc)
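
The commented-out early-stopping hook above tracks a patience counter against the previous validation loss; one plausible implementation of that signature (an assumption, since the helper's body is not shown) would be:

def early_stopping(stats, curr_patience, prev_val_loss):
    # stats is assumed to be a list of per-epoch records with the validation
    # loss at index 1, matching the `stats[0][1]` access earlier in the snippet.
    val_loss = stats[-1][1]
    if val_loss >= prev_val_loss:
        curr_patience += 1          # no improvement: burn one unit of patience
    else:
        curr_patience = 0           # improvement: reset and track the new best
        prev_val_loss = val_loss
    return curr_patience, prev_val_loss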
Example #27
0
File: train.py  Project: lonestar686/swa
def main():

    ds = getattr(torchvision.datasets, args.dataset)
    path = os.path.join(args.data_path, args.dataset.lower())
    train_set = ds(path,
                   train=True,
                   download=True,
                   transform=model_cfg.transform_train)
    test_set = ds(path,
                  train=False,
                  download=True,
                  transform=model_cfg.transform_test)
    loaders = {
        'train':
        torch.utils.data.DataLoader(train_set,
                                    batch_size=args.batch_size,
                                    shuffle=True,
                                    num_workers=args.num_workers,
                                    pin_memory=True),
        'test':
        torch.utils.data.DataLoader(test_set,
                                    batch_size=args.batch_size,
                                    shuffle=False,
                                    num_workers=args.num_workers,
                                    pin_memory=True)
    }
    num_classes = len(train_set.classes)  # max(train_set.train_labels) + 1
    print(num_classes)

    print('Preparing model')
    model = model_cfg.base(*model_cfg.args,
                           num_classes=num_classes,
                           **model_cfg.kwargs)
    model.cuda()

    if args.swa:
        print('SWA training')
        swa_model = model_cfg.base(*model_cfg.args,
                                   num_classes=num_classes,
                                   **model_cfg.kwargs)
        swa_model.cuda()
        swa_n = 0
    else:
        print('SGD training')

    def schedule(epoch):
        # Piecewise-linear LR schedule: constant until 50% of the run,
        # linear decay from 50% to 90%, then constant at the target rate
        # (swa_lr when SWA is on, 1% of lr_init otherwise).
        t = epoch / (args.swa_start if args.swa else args.epochs)
        lr_ratio = args.swa_lr / args.lr_init if args.swa else 0.01
        if t <= 0.5:
            factor = 1.0
        elif t <= 0.9:
            factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
        else:
            factor = lr_ratio
        return args.lr_init * factor
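
    # Worked example (assuming lr_init=0.1, swa_lr=0.05, swa_start=160):
    #   t <= 0.5 (epochs 0-80):    lr = 0.10, constant
    #   t <= 0.9 (epochs 81-144):  lr decays linearly from 0.10 towards 0.05
    #   t >  0.9 (epochs 145+):    lr = 0.05, the constant SWA rate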

    criterion = F.cross_entropy
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr_init,
                                momentum=args.momentum,
                                weight_decay=args.wd)

    start_epoch = 0
    if args.resume is not None:
        print('Resume training from %s' % args.resume)
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.swa:
            swa_state_dict = checkpoint['swa_state_dict']
            if swa_state_dict is not None:
                swa_model.load_state_dict(swa_state_dict)
            swa_n_ckpt = checkpoint['swa_n']
            if swa_n_ckpt is not None:
                swa_n = swa_n_ckpt

    columns = ['ep', 'lr', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc', 'time']
    if args.swa:
        columns = columns[:-1] + ['swa_te_loss', 'swa_te_acc'] + columns[-1:]
        swa_res = {'loss': None, 'accuracy': None}

    utils.save_checkpoint(
        args.dir,
        start_epoch,
        state_dict=model.state_dict(),
        swa_state_dict=swa_model.state_dict() if args.swa else None,
        swa_n=swa_n if args.swa else None,
        optimizer=optimizer.state_dict())

    for epoch in range(start_epoch, args.epochs):
        time_ep = time.time()

        lr = schedule(epoch)
        utils.adjust_learning_rate(optimizer, lr)
        train_res = utils.train_epoch(loaders['train'], model, criterion,
                                      optimizer)
        if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
            test_res = utils.eval(loaders['test'], model, criterion)
        else:
            test_res = {'loss': None, 'accuracy': None}

        if args.swa and (epoch + 1) >= args.swa_start and (
                epoch + 1 - args.swa_start) % args.swa_c_epochs == 0:
            # Incremental mean: w_swa <- w_swa + (w - w_swa) / (swa_n + 1)
            utils.moving_average(swa_model, model, 1.0 / (swa_n + 1))
            swa_n += 1
            if epoch == 0 or epoch % args.eval_freq == args.eval_freq - 1 or epoch == args.epochs - 1:
                utils.bn_update(loaders['train'], swa_model)
                swa_res = utils.eval(loaders['test'], swa_model, criterion)
            else:
                swa_res = {'loss': None, 'accuracy': None}

        if (epoch + 1) % args.save_freq == 0:
            utils.save_checkpoint(
                args.dir,
                epoch + 1,
                state_dict=model.state_dict(),
                swa_state_dict=swa_model.state_dict() if args.swa else None,
                swa_n=swa_n if args.swa else None,
                optimizer=optimizer.state_dict())

        time_ep = time.time() - time_ep
        values = [
            epoch + 1, lr, train_res['loss'], train_res['accuracy'],
            test_res['loss'], test_res['accuracy'], time_ep
        ]
        if args.swa:
            values = values[:-1] + [swa_res['loss'], swa_res['accuracy']
                                    ] + values[-1:]
        table = tabulate.tabulate([values],
                                  columns,
                                  tablefmt='simple',
                                  floatfmt='8.4f')
        if epoch % 40 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

    if args.epochs % args.save_freq != 0:
        utils.save_checkpoint(
            args.dir,
            args.epochs,
            state_dict=model.state_dict(),
            swa_state_dict=swa_model.state_dict() if args.swa else None,
            swa_n=swa_n if args.swa else None,
            optimizer=optimizer.state_dict())
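
utils.bn_update, used throughout these examples, has to refresh BatchNorm running statistics after any weight averaging; a compact equivalent (this sketch uses PyTorch's cumulative-average mode rather than the repo's per-batch momentum bookkeeping) could look like:

import torch

@torch.no_grad()
def bn_update(loader, model, device='cuda'):
    # Reset running stats, then re-estimate them in one pass over the data;
    # momentum=None makes BatchNorm keep a cumulative (equal-weight) average.
    for m in model.modules():
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
            m.momentum = None
    model.train()
    for inputs, _ in loader:
        model(inputs.to(device))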