Example #1
def train(trainloader,
          model,
          criterion,
          optimizer,
          epoch,
          cuda=False,
          compute_step_variance=False):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # found this PR suggesting changing `async` to `non_blocking`: https://github.com/quark0/darts/pull/25
        if cuda:
            inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)

        iteration = epoch * len(trainloader) + batch_idx

        track.metric(iteration=iteration,
                     epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=loss.item(),
                     cur_train_acc=prec1.item())
    return (losses.avg, top1.avg)
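
The AverageMeter helper used throughout these examples is not shown. A minimal sketch consistent with how it is called here (update(val, n=1), plus the .avg, .sum, and .count fields read by the progress string) might look like this:

class AverageMeter:
    """Running average of a scalar; a sketch of the helper assumed above."""

    def __init__(self):
        self.val = 0.0   # most recent value
        self.sum = 0.0   # weighted sum of all values seen so far
        self.count = 0   # total weight (e.g. number of samples)
        self.avg = 0.0   # running average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count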
Example #2
def eval_model(model, env, y_placeholder, obs_placeholder, attack_method,
               attack_ord=2, num_rollouts=5, eps=0.1,
               trial_num=0, render=False, alg_name='ERROR', env_name='ERROR'):
    # cleverhans needs the logits tensor, but it expects you to run through
    # and recompute it for the given observation, even though the graph
    # already exists
    cleverhans_model = CallableModelWrapper(lambda o: y_placeholder, "logits")
    attack = ATTACKS[attack_method](cleverhans_model)

    fgsm_params = {'eps': eps, 'ord': attack_ord}

    # we'll keep tracking metrics here
    prev_done_step = 0
    stats = {}
    rewards = []

    stats['eval_step'] = 0
    stats['episode'] = 0
    stats['episode_reward'] = 0.

    obs = env.reset()
    num_episodes = 0
    while num_episodes < num_rollouts:
        # the attack_op tensor will generate the perturbed state!
        attack_op = attack.generate(obs_placeholder, **fgsm_params)
        adv_obs = attack_op.eval({obs_placeholder: obs[None, :]})
        action = model(adv_obs)[0]

        # it's time for my child to act out in this adversarial world
        obs, rew, done, _ = env.step(action)
        reward = rew[0] if isinstance(env, VecEnv) else rew
        if render:
            env.render()
        done = done.any() if isinstance(done, np.ndarray) else done

        # let's get our metrics
        stats['eval_step'] += 1
        stats['episode_reward'] += reward
        stats['episode_len'] = stats['eval_step'] + prev_done_step

        if done:
            rewards.append(stats['episode_reward'])
            obs = env.reset()
            prev_done_step = stats['eval_step']
            track.debug("Finished episode %d!" % (stats['episode']))
            stats['episode'] += 1
            stats['episode_reward'] = 0
            stats['eval_step'] = 0
            num_episodes += 1
        # track metrics to access later through pandas
        track.metric(iteration=stats['eval_step'] + prev_done_step,
                     trial_num=trial_num,
                     **stats)

    env.close()
    np.save('./data/{0}_{1}_{2}_{3}_{4}.npy'.format(alg_name, env_name, attack_method, attack_ord, eps), rewards)
    print('REWARDS', rewards)
    return stats  # gimme the final stats for the episode
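
The ATTACKS registry is not shown. Given the fgsm_params keys (eps, ord) and the CallableModelWrapper above, it plausibly maps attack names to cleverhans attack classes; a hypothetical sketch, assuming cleverhans v2/v3:

from cleverhans.attacks import FastGradientMethod, MadryEtAl

# hypothetical registry matching ATTACKS[attack_method](cleverhans_model) above
ATTACKS = {
    'fgsm': FastGradientMethod,  # one-step gradient-sign attack
    'pgd': MadryEtAl,            # iterated FGSM with random start
}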
Example #3
def run(ensemble, proj_df, results_dir='./logs', dataroot='./data',
        batch_size=128, eval_batch_size=100, cuda=False, num_workers=2,
        **unused):
    """
    this evaluates both the ensemble and the baseline model on the full
    test set

    we also evaluate each model and compute their individual losses, so that
    we can plot the variance around the ensemble's dashed horizontal line
        (see top of file)
    """
    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=num_workers)
    ensemble_criterion = SoftmaxNLL()
    track.debug("[baseline] testing the ensemble on full dataset")
    ensemble_loss, ensemble_acc = test(testloader, ensemble,
                                       ensemble_criterion, epoch=-1,
                                       cuda=cuda, metric=False)

    # get the no-noise baseline evaluation
    proj = track.Project(results_dir)
    best_model, best_df = load_trial(proj, noise_scale=0.0)

    track.debug("[baseline] testing no-noise baseline model on full dataset")
    baseline_criterion = torch.nn.CrossEntropyLoss()
    baseline_loss, baseline_acc = test(testloader, best_model,
                                       baseline_criterion,
                                       epoch=-1, cuda=cuda, metric=False)

    # now, test each of the ensemble's models
    model_losses = []
    model_accs = []
    track.debug("[baseline] testing individual models on full dataset")
    for i, model in enumerate(ensemble.models):
        track.debug("[baseline] testing model %d of %d" %
                    (i, len(ensemble.models)))
        model_loss, model_acc = test(testloader, model,
                                     baseline_criterion,
                                     epoch=-1, cuda=cuda, metric=False)
        model_losses.append(model_loss)
        model_accs.append(model_acc)

    # we just need to track the scalar results of this evaluation
    # we can access the baseline test *curve* from the jupyter notebook (later)
    track.metric(iteration=0, ensemble_loss=ensemble_loss,
                 ensemble_acc=ensemble_acc,
                 best_baseline_loss=baseline_loss,
                 best_baseline_acc=baseline_acc,
                 model_losses=model_losses,
                 model_accs=model_accs)
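
SoftmaxNLL is not defined in this snippet. One plausible reading, assuming the ensemble returns averaged post-softmax probabilities rather than raw logits, is negative log-likelihood of those probabilities; this is a hypothetical sketch, not the confirmed implementation:

import torch
import torch.nn.functional as F

class SoftmaxNLL(torch.nn.Module):
    """Hypothetical sketch: NLL over ensemble-averaged probabilities."""

    def forward(self, probs, targets):
        # clamp before log so zero probabilities don't produce -inf
        return F.nll_loss(probs.clamp(min=1e-12).log(), targets)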
Example #4
def do_training(args):
    trainloader, testloader = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    if args.optimizer == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay)
    else:
        optimizer = EKFAC(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay,
                          eps=args.eps,
                          update_freq=args.update_freq)

    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        args.lr = adjust_learning_rate(epoch, optimizer, args.lr,
                                       args.schedule, args.gamma)
        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      epoch, args.cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(),
                                   "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)
Example #5
def train(trainloader, model, criterion, optimizer, epoch):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` became `non_blocking` in PyTorch 0.4 (and `async` is a
        # reserved word in Python 3.7+); Variable wrapping is no longer needed
        inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)

        # compute output
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)

        iteration = epoch * len(trainloader) + batch_idx
        track.metric(iteration=iteration,
                     epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=loss.item(),
                     cur_train_acc=prec1.item())
    return (losses.avg, top1.avg)
Example #6
def do_training(args):
    trainloader, testloader = build_dataset(
        args.dataset,
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model(args.arch, num_classes=num_classes(args.dataset))
    if args.cuda:
        model = torch.nn.DataParallel(model).cuda()

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    num_chunks = max(1, args.batch_size // args.max_samples_per_gpu)

    optimizer = LARS(params=model.parameters(),
                     lr=args.lr,
                     momentum=args.momentum,
                     weight_decay=args.weight_decay,
                     eta=args.eta,
                     max_epoch=args.epochs)

    criterion = torch.nn.CrossEntropyLoss()

    best_acc = 0.0
    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        train_loss, train_acc = train(trainloader,
                                      model,
                                      criterion,
                                      optimizer,
                                      epoch,
                                      args.cuda,
                                      num_chunks=num_chunks)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   args.cuda)
        track.debug('Finished epoch %d... | train loss %.3f | train acc %.3f '
                    '| test loss %.3f | test acc %.3f' %
                    (epoch, train_loss, train_acc, test_loss, test_acc))
        # Save model
        model_fname = os.path.join(track.trial_dir(),
                                   "model{}.ckpt".format(epoch))
        torch.save(model, model_fname)
        if test_acc > best_acc:
            best_acc = test_acc
            best_fname = os.path.join(track.trial_dir(), "best.ckpt")
            track.debug("New best score! Saving model")
            torch.save(model, best_fname)
Example #7
def test(testloader, model, criterion, epoch, cuda=False):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)

            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            # (Variable/volatile were removed in PyTorch 0.4; the enclosing
            # torch.no_grad() already disables autograd here)

            # compute output
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
                % (losses.avg, top1.avg, top1.sum, top1.count)
            progress_bar(batch_idx, len(testloader), progress_str)
    track.metric(iteration=0,
                 epoch=epoch,
                 avg_test_loss=losses.avg,
                 avg_test_acc=top1.avg)
    return (losses.avg, top1.avg)
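
The accuracy helper is also not shown; the standard PyTorch ImageNet-example implementation matches how it is called here (prec1, prec5 = accuracy(outputs, targets, topk=(1, 5)), returning percentages as tensors). A sketch under that assumption:

def accuracy(output, target, topk=(1,)):
    """Computes precision@k for the specified values of k (standard sketch)."""
    maxk = max(topk)
    batch_size = target.size(0)

    # indices of the top-k predictions, one column per sample after transpose
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res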
Example #8
def main(args):
    trainloader, testloader = build_dataset(
        'cifar10',
        dataroot=args.dataroot,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        num_workers=2)
    model = build_model('ResNet18', num_classes=10)
    criterion = torch.nn.CrossEntropyLoss()
    eigenvals, eigenvecs = compute_hessian_eigenthings(model,
                                                       testloader,
                                                       criterion,
                                                       args.num_eigenthings,
                                                       args.num_steps,
                                                       momentum=args.momentum,
                                                       use_gpu=args.cuda)
    print("Eigenvecs:")
    print(eigenvecs)
    print("Eigenvals:")
    print(eigenvals)
    track.metric(iteration=0, eigenvals=eigenvals)
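
compute_hessian_eigenthings presumably comes from the pytorch-hessian-eigenthings package (github.com/noahgolmant/pytorch-hessian-eigenthings), whose API matches this call; it estimates the top eigenvalue/eigenvector pairs of the loss Hessian via deflated power iteration with Hessian-vector products. Assuming that package, the import behind the call would be:

# assumption: the import behind the call above
from hessian_eigenthings import compute_hessian_eigenthings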
Example #9
def run(ensemble,
        proj_df,
        dataroot='./data',
        batch_size=128,
        eval_batch_size=100,
        cuda=False,
        num_workers=2,
        **unused):
    """ let's compute that entropy baby """
    num_classes = 10  # build_dataset('cifar10') <- not worth computing rn
    entropy_criterion = Entropy()
    ensemble.models = ensemble.models[::4]

    # iterate for all possible classes in dataset
    for class_ind in range(num_classes):
        # build dataset per class
        track.debug("Evaluating entropy for class id: %d" % (class_ind))
        class_trainloader, class_testloader = build_single_class_dataset(
            'cifar10',
            class_ind=class_ind,
            dataroot=dataroot,
            batch_size=batch_size,
            eval_batch_size=eval_batch_size,
            num_workers=num_workers)

        # compute the entropy of the model post-hoc as well
        entropy = test(class_testloader,
                       ensemble,
                       entropy_criterion,
                       epoch=-1,
                       cuda=cuda,
                       metric=False,
                       criterion_has_labels=False,
                       compute_acc=False)

        track.debug("\n\n\tEntropy: %.2f" % entropy)
        track.metric(cifar_class_id=class_ind, entropy=entropy)
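
Entropy is called with criterion_has_labels=False, i.e. as criterion(outputs) with no targets. A hypothetical sketch, assuming the model emits raw logits and the criterion returns the mean Shannon entropy of the predictive distribution:

import torch
import torch.nn.functional as F

class Entropy(torch.nn.Module):
    """Hypothetical sketch: mean Shannon entropy of the model's predictions."""

    def forward(self, outputs):
        # softmax the logits into a distribution, then average -sum(p log p)
        logp = F.log_softmax(outputs, dim=1)
        return -(logp.exp() * logp).sum(dim=1).mean()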
Example #10
def _do_training(train, val, shared, training_state):
    batch_size = flags.FLAGS.batch_size

    loss_window = RollingAverageWindow(len(train) // 10 // batch_size)
    acc_window = RollingAverageWindow(len(train) // 10 // batch_size)
    grad_window = RollingAverageWindow(len(train) // 10 // batch_size)

    def _tqdm_postfix():
        return {
            'loss': '{:06.3f}'.format(loss_window.value()),
            'acc': '{:05.1%}'.format(acc_window.value()),
            'gradnorm': '{:08.2e}'.format(grad_window.value())
        }

    shared.set_mode(evaluation=False)
    shared.lr(training_state.lr)
    perm = np.arange(len(train))

    for epoch in range(1 + training_state.epoch, 1 + flags.FLAGS.max_epochs):
        epochfmt = intfmt(flags.FLAGS.max_epochs)
        training_state.epoch = epoch
        track.debug('begin epoch ' + epochfmt, epoch)
        # one sample at a time greatly simplifies pytorch seq2seq!
        np.random.shuffle(perm)

        samples = (train[i] for i in perm)
        with tqdm(total=len(train), postfix=_tqdm_postfix()) as progbar:
            for exs in chunkify(samples, batch_size):
                shared.zero_grad()
                loss, acc, gradnorm = shared.train(exs)
                loss_window.update(loss)
                acc_window.update(acc)
                grad_window.update(gradnorm)
                shared.step()
                progbar.update(len(exs))
                progbar.set_postfix(**_tqdm_postfix())

        shared.set_mode(evaluation=True)
        val_diagnostics = _diagnose(val, shared)
        train_diagnostics = _diagnose(train, shared, min(len(val), len(train)))
        track.metric(iteration=epoch, lr=training_state.lr)
        track.metric(iteration=epoch,
                     **{'val ' + k: v
                        for k, v in val_diagnostics.items()})
        track.metric(iteration=epoch,
                     **{'train ' + k: v
                        for k, v in train_diagnostics.items()})
        shared.set_mode(evaluation=False)
        val_diagnostics_str = _str_diagnostics('val', val_diagnostics)
        train_diagnostics_str = _str_diagnostics('(sampled) train',
                                                 train_diagnostics)
        track.debug('epoch ' + epochfmt + ' of ' + epochfmt + '\n{}\n{}',
                    epoch, flags.FLAGS.max_epochs, val_diagnostics_str,
                    train_diagnostics_str)

        cur_val_loss = val_diagnostics['loss (*total)']
        if cur_val_loss < training_state.best_val_loss:
            training_state.patience = training_state.initial_patience
            training_state.best_val_loss = cur_val_loss
            best_file = _checkpoint_file('best.pth')
            track.debug('updating best model into file {}', best_file)
            _save_checkpoint(best_file, shared.model, training_state)
        else:
            training_state.patience -= 1
            track.debug('val loss not improving; dropping patience')
            shared.lr(training_state.lr)

        if training_state.patience == 0:
            track.debug('out of patience, dropping lr')
            training_state.lr *= flags.FLAGS.lr_decay_rate
            training_state.patience = training_state.initial_patience

        track.debug('lr {} patience {} best val loss so far {}',
                    training_state.lr, training_state.patience,
                    training_state.best_val_loss)

        early_stop = training_state.lr < flags.FLAGS.min_lr
        if early_stop:
            track.debug(
                'lr dropped to {} < min tolerable lr {}, early stopping',
                training_state.lr, flags.FLAGS.min_lr)

        if _check_period(epoch, flags.FLAGS.persist_every) or early_stop:
            epochfmt = intfmt(flags.FLAGS.max_epochs, fill='0')
            checkpoint_file = _checkpoint_file(epochfmt.format(epoch) + '.pth')
            track.debug('persisting model to {}', checkpoint_file)
            _save_checkpoint(checkpoint_file, shared.model, training_state)

        if early_stop:
            break
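
RollingAverageWindow is another unshown helper; from its use above it takes a window size and exposes update(value) and value(). A minimal sketch under that assumption (the max(1, ...) guards against the window size working out to zero for small datasets):

from collections import deque

class RollingAverageWindow:
    """Hypothetical sketch: mean over the last `window_size` updates."""

    def __init__(self, window_size):
        self._values = deque(maxlen=max(1, window_size))

    def update(self, value):
        self._values.append(value)

    def value(self):
        if not self._values:
            return 0.0
        return sum(self._values) / len(self._values)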
Example #11
with track.trial(args.logroot, None, param_map=param_map):
    try:
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            train_loss = train()
            val_loss = evaluate(val_data)
            print('-' * 89)
            track.debug(
                '| end of epoch {:3d} | time: {:5.2f}s | train loss {:5.2f} | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch,
                                           (time.time() - epoch_start_time),
                                           train_loss, val_loss,
                                           math.exp(val_loss)))
            print('-' * 89)
            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         test_loss=val_loss)
            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss
            else:
                # Anneal the learning rate if no improvement has been seen in the validation dataset.
                lr /= 4.0
    except KeyboardInterrupt:
        # allow a graceful early exit on Ctrl-C
        track.debug('Exiting from training early')
Example #12
def do_training(args):

    hyperparameters = {
        'lr': args.lr,
        'epochs': args.epochs,
        'resume_from': 0,
        'coco_version': args.coco_version,  #can be either '2014' or '2017'
        'batch_size': args.batch_size,
        'weight_decay': args.weight_decay,
        'momentum': args.momentum,
        'optimizer': args.optimizer,
        'alpha': args.alpha,
        'gamma': args.gamma,
        'lcoord': args.lcoord,
        'lno_obj': args.lno_obj,
        'iou_type': tuple(int(a) for a in tuple(args.iou_type)),
        'iou_ignore_thresh': args.iou_ignore_thresh,
        'tfidf': args.tfidf,
        'idf_weights': True,
        'tfidf_col_names': ['img_freq', 'none', 'none', 'none', 'no_softmax'],
        'wasserstein': args.wasserstein,
        'inf_confidence': args.inf_confidence,
        'inf_iou_threshold': args.inf_iou_threshold,
        'augment': args.augment,
        'workers': 1,
        'pretrained': args.is_pretrained,
        'path': args.trial_id,
        'reduction': args.reduction
    }

    mode = {
        'bayes_opt': False,
        'multi_scale': args.multi_scale,
        'show_hp': args.show_hp,
        'show_output': args.show_output,
        'multi_gpu': False,
        'train_subset': args.train_subset,
        'test_subset': args.test_subset,
        'show_temp_summary': args.show_temp_summary,
        'save_summary': False
    }

    this_proj = track.Project("./logs/" + args.experimentname)
    if args.resume == 'last':
        most_recent = this_proj.ids["start_time"].nlargest(2).idxmin()
        most_recent_id = this_proj.ids["trial_id"].iloc[[most_recent]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            most_recent_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')
        args.resume = most_recent_id.item()
    elif args.resume == 'best':
        ids = this_proj.ids["trial_id"]
        res = this_proj.results(ids)
        best_map = res["coco_stats:map_all"].idxmax()
        best_map_id = res["trial_id"].iloc[[best_map]]
        PATH = os.path.join("./logs/" + args.experimentname,
                            best_map_id.item())
        hyperparameters['path'] = os.path.join(PATH, 'best.tar')
        args.resume = best_map_id.item()
    else:
        PATH = os.path.join("./logs/" + args.experimentname, args.resume)
        hyperparameters['path'] = os.path.join(PATH, 'last.tar')

    coco_version = hyperparameters['coco_version']
    mAP_best = 0
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model, optimizer, hyperparameters, PATH = init_model.init_model(
        hyperparameters, mode)

    model.hp = hyperparameters
    model.mode = mode

    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
    else:
        inp_dim = model.inp_dim

    if hyperparameters['augment'] > 0:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose([
                                 Augment(hyperparameters['augment']),
                                 ResizeToTensor(inp_dim)
                             ]))
    else:
        train_dataset = Coco(partition='train',
                             coco_version=coco_version,
                             subset=mode['train_subset'],
                             transform=transforms.Compose(
                                 [ResizeToTensor(inp_dim)]))

    batch_size = hyperparameters['batch_size']

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=helper.collate_fn,
                                  num_workers=hyperparameters['workers'],
                                  pin_memory=True)

    test_dataset = Coco(partition='val',
                        coco_version=coco_version,
                        subset=mode['test_subset'],
                        transform=transforms.Compose([ResizeToTensor(inp_dim)
                                                      ]))

    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.eval_batch_size,
                                 shuffle=False,
                                 collate_fn=helper.collate_fn,
                                 num_workers=1,
                                 pin_memory=True)

    # Calculate total number of model parameters
    num_params = sum(p.numel() for p in model.parameters())
    track.metric(iteration=0, num_params=num_params)

    for epoch in range(args.epochs):
        track.debug("Starting epoch %d" % epoch)
        #         args.lr = adjust_learning_rate(epoch, optimizer, args.lr, args.schedule,
        #                                        args.gamma)

        outcome = train(train_dataloader, model, optimizer, epoch)

        mAP = 0
        mAP = test(test_dataloader, model, epoch, device)

        track.debug(
            'Finished epoch %d... | train loss %.3f | avg_iou %.3f | avg_conf %.3f | avg_no_conf %.3f'
            '| avg_pos %.3f | avg_neg %.5f | mAP %.5f' %
            (epoch, outcome['avg_loss'], outcome['avg_iou'],
             outcome['avg_conf'], outcome['avg_no_conf'], outcome['avg_pos'],
             outcome['avg_neg'], mAP))

        model_fname = os.path.join(track.trial_dir(), "last.tar")
        torch.save(
            {
                'model_state_dict': (model.module.state_dict()
                                     if type(model) is nn.DataParallel
                                     else model.state_dict()),
                'optimizer_state_dict': optimizer.state_dict(),
                'avg_loss': outcome['avg_loss'],
                'avg_iou': outcome['avg_iou'],
                'avg_pos': outcome['avg_pos'],
                'avg_neg': outcome['avg_neg'],
                'avg_conf': outcome['avg_conf'],
                'avg_no_conf': outcome['avg_no_conf'],
                'mAP': mAP,
                'hyperparameters': hyperparameters
            }, model_fname)

        if mAP > mAP_best:
            mAP_best = mAP
            best_fname = os.path.join(track.trial_dir(), "best.tar")
            track.debug("New best score! Saving model")

            torch.save(
                {
                    'model_state_dict': (model.module.state_dict()
                                         if type(model) is nn.DataParallel
                                         else model.state_dict()),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'avg_loss': outcome['avg_loss'],
                    'avg_iou': outcome['avg_iou'],
                    'avg_pos': outcome['avg_pos'],
                    'avg_neg': outcome['avg_neg'],
                    'avg_conf': outcome['avg_conf'],
                    'avg_no_conf': outcome['avg_no_conf'],
                    'mAP': mAP,
                    'hyperparameters': hyperparameters
                }, best_fname)
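
For context, restoring one of these checkpoints would be the mirror image of the torch.save calls above; init_model.init_model presumably does something similar internally when hyperparameters['path'] points at last.tar or best.tar. A sketch, assuming the same dict keys:

checkpoint = torch.load(hyperparameters['path'], map_location=device)
target = model.module if type(model) is nn.DataParallel else model
target.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
mAP_best = checkpoint.get('mAP', 0)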
Example #13
def test(testloader, model, epoch, device):
    # FIXME remove this and make paste_masks_in_image run on the GPU
    cpu_device = torch.device("cpu")

    batch_time = AverageMeter()
    data_time = AverageMeter()

    hyperparameters = model.hp
    confidence = hyperparameters['inf_confidence']
    iou_threshold = hyperparameters['inf_iou_threshold']

    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
        pw_ph = model.module.pw_ph
        cx_cy = model.module.cx_cy
        stride = model.module.stride
    else:
        inp_dim = model.inp_dim
        pw_ph = model.pw_ph
        cx_cy = model.cx_cy
        stride = model.stride

    pw_ph = pw_ph.to(device)
    cx_cy = cx_cy.to(device)
    stride = stride.to(device)

    sys.stdout = open(os.devnull, 'w')  # silence hardcoded printing in the COCO helpers
    coco = coco_utils.get_coco_api_from_dataset(testloader.dataset)
    iou_types = ["bbox"]
    coco_evaluator = coco_eval.CocoEvaluator(coco, iou_types)
    sys.stdout = sys.__stdout__  # restore normal printing

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (images, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)

            images = images.to(device)

            targets2 = []
            for t in targets:
                dd = {}
                for k, v in t.items():
                    if (k != 'img_size'):
                        dd[k] = v.to(device)
                    else:
                        dd[k] = v
                targets2.append(dd)

            # (a plain dict comprehension would move 'img_size' to the GPU
            # too, hence the explicit loop above)
            targets = targets2

            raw_pred = model(images, device)

            true_pred = util.transform(raw_pred.clone().detach(), pw_ph, cx_cy,
                                       stride)

            sorted_pred = torch.sort(true_pred[:, :, 4] *
                                     (true_pred[:, :, 5:].max(axis=2)[0]),
                                     descending=True)
            pred_mask = sorted_pred[0] > confidence
            indices = [(sorted_pred[1][e, :][pred_mask[e, :]])
                       for e in range(pred_mask.shape[0])]
            pred_final = [
                true_pred[i, indices[i], :] for i in range(len(indices))
            ]

            pred_final_coord = [
                util.get_abs_coord(pred_final[i].unsqueeze(-2))
                for i in range(len(pred_final))
            ]

            indices = [
                nms_box.nms(pred_final_coord[i][0], pred_final[i][:, 4],
                            iou_threshold) for i in range(len(pred_final))
            ]
            pred_final = [
                pred_final[i][indices[i], :] for i in range(len(pred_final))
            ]

            abs_pred_final = [
                helper.convert2_abs_xyxy(pred_final[i], targets[i]['img_size'],
                                         inp_dim)
                for i in range(len(pred_final))
            ]

            outputs = [dict() for _ in range(len(abs_pred_final))]
            for i, atrbs in enumerate(abs_pred_final):

                outputs[i]['boxes'] = atrbs[:, :4]
                outputs[i]['scores'] = pred_final[i][:, 4]
                try:
                    outputs[i]['labels'] = pred_final[i][:, 5:].max(
                        axis=1)[1] + 1  # could be empty
                except Exception:
                    outputs[i]['labels'] = torch.tensor([])

            outputs = [{k: v.to(cpu_device)
                        for k, v in t.items()} for t in outputs]

            res = {
                target["image_id"].item(): output
                for target, output in zip(targets, outputs)
            }
            coco_evaluator.update(res)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

    sys.stdout = open(os.devnull, 'w')  # silence hardcoded printing again

    coco_evaluator.synchronize_between_processes()
    # accumulate predictions from all images
    coco_evaluator.accumulate()
    coco_evaluator.summarize()

    metrics = coco_evaluator.get_stats()

    sys.stdout = sys.__stdout__  # restore normal printing

    coco_stats = {
        'map_all': metrics[0],
        'map@0.5': metrics[1],
        'map@0.75': metrics[2],
        'map_small': metrics[3],
        'map_med': metrics[4],
        'map_large': metrics[5],
        'recall@1': metrics[6],
        'recall@10': metrics[7],
        'recall@100': metrics[8],
        'recall@small': metrics[9],
        'recall@medium': metrics[10],
        'recall@large': metrics[11]
    }

    track.metric(iteration=0, epoch=epoch, coco_stats=coco_stats)

    return (metrics[0])
Example #14
def train(trainloader, model, optimizer, epoch, cuda=True):
    # switch to train mode
    model.train()
    hyperparameters = model.hp
    mode = model.mode

    if type(model) is nn.DataParallel:
        inp_dim = model.module.inp_dim
        pw_ph = model.module.pw_ph
        cx_cy = model.module.cx_cy
        stride = model.module.stride
    else:
        inp_dim = model.inp_dim
        pw_ph = model.pw_ph
        cx_cy = model.cx_cy
        stride = model.stride

    if cuda:
        pw_ph = pw_ph.cuda()
        cx_cy = cx_cy.cuda()
        stride = stride.cuda()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    avg_loss = AverageMeter()
    avg_iou = AverageMeter()
    avg_conf = AverageMeter()
    avg_no_conf = AverageMeter()
    avg_pos = AverageMeter()
    avg_neg = AverageMeter()
    end = time.time()
    break_flag = 0

    if mode['show_temp_summary']:
        writer = SummaryWriter(os.path.join(track.trial_dir(), 'temp_vis/'))

    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        if cuda:
            inputs = inputs.cuda()

        # compute output
        raw_pred = model(inputs, torch.cuda.is_available())
        true_pred = util.transform(raw_pred.clone().detach(), pw_ph, cx_cy,
                                   stride)
        iou_list = util.get_iou_list(true_pred, targets, hyperparameters,
                                     inp_dim)

        resp_raw_pred, resp_cx_cy, resp_pw_ph, resp_stride, no_obj = util.build_tensors(
            raw_pred, iou_list, pw_ph, cx_cy, stride, hyperparameters)

        stats = helper.get_progress_stats(true_pred, no_obj, iou_list, targets)
        if hyperparameters['wasserstein']:
            no_obj = util.get_wasserstein_matrices(raw_pred, iou_list, inp_dim)

        try:
            loss = util.yolo_loss(resp_raw_pred, targets, no_obj, resp_pw_ph,
                                  resp_cx_cy, resp_stride, inp_dim,
                                  hyperparameters)
        except RuntimeError:
            print('bayes opt failed')
            break_flag = 1
            break

        # measure accuracy and record loss
        avg_loss.update(loss.item())
        avg_iou.update(stats['iou'])
        avg_conf.update(stats['pos_conf'])
        avg_no_conf.update(stats['neg_conf'])
        avg_pos.update(stats['pos_class'])
        avg_neg.update(stats['neg_class'])

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if mode['show_output']:  # plot progress
            progress_str = 'Loss: %.4f | AvIoU: %.3f | AvPConf: %.3f | AvNConf: %.5f | AvClass: %.3f | AvNClass: %.5f'\
                % (loss.item(), stats['iou'], stats['pos_conf'],
                   stats['neg_conf'], stats['pos_class'], stats['neg_class'])
            progress_bar(batch_idx, len(trainloader), progress_str)

        iteration = epoch * len(trainloader) + batch_idx

        if mode['show_temp_summary']:
            writer.add_scalar('AvLoss/train', avg_loss.avg, iteration)
            writer.add_scalar('AvIoU/train', avg_iou.avg, iteration)
            writer.add_scalar('AvPConf/train', avg_conf.avg, iteration)
            writer.add_scalar('AvNConf/train', avg_no_conf.avg, iteration)
            writer.add_scalar('AvClass/train', avg_pos.avg, iteration)
            writer.add_scalar('AvNClass/train', avg_neg.avg, iteration)

    track.metric(iteration=iteration,
                 epoch=epoch,
                 avg_train_loss=avg_loss.avg,
                 avg_train_iou=avg_iou.avg,
                 avg_train_conf=avg_conf.avg,
                 avg_train_neg_conf=avg_no_conf.avg,
                 avg_train_pos=avg_pos.avg,
                 avg_train_neg=avg_neg.avg)

    outcome = {
        'avg_loss': avg_loss.avg,
        'avg_iou': avg_iou.avg,
        'avg_pos': avg_pos.avg,
        'avg_neg': avg_neg.avg,
        'avg_conf': avg_conf.avg,
        'avg_no_conf': avg_no_conf.avg,
        'broken': break_flag
    }

    return outcome
Example #15
def main():
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.sqrt_lr:
        lr = args.lr * math.sqrt(args.batch_size / 32.)
    else:
        lr = args.lr

    optimizer = torch.optim.SGD(model.parameters(),
                                lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=min(args.batch_size, args.max_samples),
        shuffle=(train_sampler is None),
        num_workers=args.workers,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.max_samples,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    with track.trial(args.logroot,
                     None,
                     param_map={'batch_size': args.batch_size}):
        for epoch in range(args.start_epoch, args.epochs):
            if args.distributed:
                train_sampler.set_epoch(epoch)
            adjust_learning_rate(optimizer, epoch)

            # train for one epoch
            train_loss = train(train_loader, model, criterion, optimizer,
                               epoch)

            # evaluate on validation set
            with torch.no_grad():
                val_loss, prec1 = validate(val_loader, model, criterion)

            track.metric(iteration=epoch,
                         train_loss=train_loss,
                         test_loss=val_loss,
                         prec=prec1)
            # Log model
            model_fname = os.path.join(track.trial_dir(),
                                       "model{}.ckpt".format(epoch))
            torch.save(model, model_fname)

            # Save the model if the validation loss is the best we've seen so far.
            # remember best prec@1 and save checkpoint
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best:
                best_fname = os.path.join(track.trial_dir(), "best.ckpt")
                with open(best_fname, 'wb') as f:
                    torch.save(model, f)
Example #16
    # Save checkpoint.
    acc = 100.0 * correct / total
    if acc > best_acc:
        print("Saving..")
        state = {"net": net.state_dict(), "acc": acc, "epoch": epoch}
        if not os.path.isdir("checkpoint"):
            os.mkdir("checkpoint")
        ckpt_path = os.path.join(track.trial_dir(), "ckpt.pth")
        torch.save(state, ckpt_path)
        best_acc = acc
    test_loss = test_loss / len(testloader)
    return test_loss, acc, best_acc


with track.trial(args.logroot, None, param_map=vars(args)):
    for epoch in range(start_epoch, start_epoch + 200):
        train_loss, train_acc = train(epoch)
        test_loss, test_acc, best_acc = test(epoch)
        track.metric(
            iteration=epoch,
            train_loss=train_loss,
            train_acc=train_acc,
            test_loss=test_loss,
            test_acc=test_acc,
            best_acc=best_acc,
        )
        track.debug(
            f"epoch {epoch} finished with stats: best_acc = {best_acc} | train_acc = {train_acc} | test_acc = {test_acc} | train_loss = {train_loss} | test_loss = {test_loss}"
        )
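
The metrics logged with track.metric can be read back as pandas dataframes through the track.Project API seen in the other examples. A minimal sketch, assuming the result columns match the metric keys logged above:

import track

proj = track.Project(args.logroot)
# proj.ids is a dataframe of trials; results(...) concatenates their metrics
results = proj.results(proj.ids["trial_id"])
print(results[["iteration", "train_acc", "test_acc", "best_acc"]].tail())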
Example #17
def run(ensemble,
        trial_df,
        results_dir='./logs',
        dataroot='./data',
        class_ind=0,
        batch_size=128,
        eval_batch_size=100,
        cuda=False,
        num_workers=2,
        start_epoch=160,
        end_epoch=200,
        **unused):

    trainloader, testloader = build_dataset('cifar10',
                                            dataroot=dataroot,
                                            batch_size=batch_size,
                                            eval_batch_size=eval_batch_size,
                                            num_workers=num_workers)

    # this will only iterate over examples of one class
    class_trainloader, class_testloader = build_single_class_dataset(
        'cifar10',
        class_ind=class_ind,
        dataroot=dataroot,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        num_workers=num_workers)

    full_ensemble = ensemble
    track.debug("[ensemble_size] starting to test all ensembles (class = %d)" %
                class_ind)
    for i in range(len(ensemble.models)):
        ensemble_size = i + 1
        model_ind = len(ensemble.models) - 1 - i
        track.debug("[ensemble_size] starting size %d / %d ensemble" %
                    (i + 1, len(ensemble.models)))
        ensemble_loss = SoftmaxNLL()
        one_loss = CrossEntropyLoss()

        entropy_criterion = Entropy()

        ensemble = Ensemble(full_ensemble.models[model_ind:])
        single_model = full_ensemble.models[model_ind]

        # we want to do metrics for (a) the ensemble with varying sizes and
        #   (b) the individual models corresponding to that epoch
        def _test_dataset(model, testloader, criterion):
            loss, acc = test(testloader,
                             model,
                             criterion,
                             epoch=-1,
                             cuda=cuda,
                             metric=False)
            # compute the entropy of the model post-hoc as well
            entropy = test(testloader,
                           model,
                           entropy_criterion,
                           epoch=-1,
                           cuda=cuda,
                           metric=False,
                           criterion_has_labels=False,
                           compute_acc=False)
            return loss, acc, entropy

        # metrics for the both models over both datasets
        # (a) on the whole dataset
        #      (i) for the ensemble
        #      (ii)for the single model from this epoch
        # (b) on a single class
        #      (i) for the ensemble
        #      (ii)for the single model from this epoch
        stats = {}
        models = (ensemble, single_model)
        loaders = (testloader, class_testloader)
        losses = ensemble_loss, one_loss
        model_names = ['ensemble', 'single_model']
        loader_names = ['full', 'single_class']
        for i, j in itertools.product(range(len(models)), range(len(loaders))):
            track.debug("[ensemble size: %d] Evaluating loss/acc/entropy for "
                        "%s on %s dataset" %
                        (ensemble_size, model_names[i], loader_names[j]))
            metric = model_names[i] + '_' + loader_names[j]
            loss, acc, entropy = _test_dataset(models[i], loaders[j],
                                               losses[i])
            stats[metric + '_loss'] = loss
            stats[metric + '_acc'] = acc
            stats[metric + '_entropy'] = entropy
        track.metric(ensemble_size=ensemble_size, **stats)
Example #18
def test(testloader,
         model,
         criterion,
         epoch,
         cuda=False,
         metric=True,
         criterion_has_labels=True,
         compute_acc=True):
    """
    criterion = torch.nn.Loss instance.
    criterion_has_labels (bool): if true, the above criterion is called as
        criterion(outputs, labels). otherwise, just criterion(outputs).

    returns (test_loss, test_acc) if compute_acc is True
    otherwise, returns test_loss alone
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            # measure data loading time
            data_time.update(time.time() - end)
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            # compute output (the enclosing torch.no_grad() already applies)
            outputs = model(inputs)
            if criterion_has_labels:
                loss = criterion(outputs, targets)
            else:
                loss = criterion(outputs)

            # measure accuracy and record loss
            losses.update(loss.item(), inputs.size(0))
            if compute_acc:
                prec1, prec5 = accuracy(outputs.data,
                                        targets.data,
                                        topk=(1, 5))
                top1.update(prec1.item(), inputs.size(0))
                top5.update(prec5.item(), inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            if compute_acc:
                progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
                    % (losses.avg, top1.avg, top1.sum, top1.count)
            else:
                progress_str = 'Loss: %.3f (%d/%d)'\
                    % (losses.avg, batch_idx*inputs.size(0), losses.count)

            progress_bar(batch_idx, len(testloader), progress_str)
    if metric:
        track.metric(iteration=0,
                     epoch=epoch,
                     avg_test_loss=losses.avg,
                     avg_test_acc=top1.avg)
    if compute_acc:
        return (losses.avg, top1.avg)
    else:
        return losses.avg
Example #19
def train(trainloader,
          model,
          criterion,
          optimizer,
          epoch,
          cuda=False,
          num_chunks=4):
    # switch to train mode
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (all_inputs, all_targets) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        # do mini-mini-batching for large batch sizes
        xs = all_inputs.chunk(num_chunks)
        ys = all_targets.chunk(num_chunks)

        optimizer.zero_grad()
        batch_prec1 = 0.0
        batch_loss = 0.0
        for (inputs, targets) in zip(xs, ys):
            if cuda:
                inputs, targets = inputs.cuda(), targets.cuda(non_blocking=True)

            # compute output
            outputs = model(inputs)
            mini_loss = criterion(outputs, targets) / num_chunks
            batch_loss += mini_loss.item()

            mini_loss.backward()

            # measure accuracy and record loss
            prec1, prec5 = accuracy(outputs.data, targets.data, topk=(1, 5))
            batch_prec1 += prec1.item() / num_chunks

            losses.update(num_chunks * mini_loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

        # compute gradient and do SGD step
        optimizer.step(epoch)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        progress_str = 'Loss: %.3f | Acc: %.3f%% (%d/%d)'\
            % (losses.avg, top1.avg, top1.sum, top1.count)
        progress_bar(batch_idx, len(trainloader), progress_str)

        iteration = epoch * len(trainloader) + batch_idx
        track.metric(iteration=iteration,
                     epoch=epoch,
                     avg_train_loss=losses.avg,
                     avg_train_acc=top1.avg,
                     cur_train_loss=batch_loss,
                     cur_train_acc=batch_prec1)
    return (losses.avg, top1.avg)
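
The / num_chunks scaling is what makes this chunked loop equivalent to one large-batch step: gradients accumulate across backward() calls, so scaling each chunk's mean loss by 1/num_chunks reproduces the full-batch mean (exactly so when the chunks are equal-sized). A small self-contained check of that identity, new illustrative code rather than part of the original:

import torch

w = torch.randn(4, requires_grad=True)
x, y = torch.randn(8, 4), torch.randn(8)

# full-batch gradient of a mean squared error
((x @ w - y) ** 2).mean().backward()
g_full = w.grad.clone()

# accumulated chunked gradient with the same 1/num_chunks scaling as above
w.grad = None
num_chunks = 2
for xs, ys in zip(x.chunk(num_chunks), y.chunk(num_chunks)):
    (((xs @ w - ys) ** 2).mean() / num_chunks).backward()

assert torch.allclose(g_full, w.grad, atol=1e-6)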