def train(train_dataprovider, val_dataprovider, optimizer, scheduler, model,
          archloader, criterion, args, val_iters, seed):
    objs, top1 = AvgrageMeter(), AvgrageMeter()

    for p in model.parameters():
        p.grad = torch.zeros_like(p)

    for step in range(args.total_iters):
        model.train()
        t0 = time.time()
        image, target = train_dataprovider.next()
        datatime = time.time() - t0
        n = image.size(0)
        optimizer.zero_grad()
        image = image.cuda(args.gpu)
        target = target.cuda(args.gpu)

        # Fair Sampling
        # rngs = []
        # for i in range(len(operations)):  # 21 layers
        #     seed += 1
        #     random.seed(seed)
        #     rngs.append(random.sample(operations[i], len(operations[i])))
        # rngs = np.transpose(rngs)

        fair_arc_list = archloader.generate_niu_fair_batch()

        for arc in fair_arc_list:
            logits = model(image, archloader.convert_list_arc_str(arc))
            loss = criterion(logits, target)
            loss.backward()

        # for rng in rngs:
        #     logits = model(image, rng)
        #     loss = criterion(logits, target)
        #     loss.backward()

        nn.utils.clip_grad_value_(model.parameters(), args.grad_clip)
        optimizer.step()
        scheduler.step()

        # logits/loss here come from the last architecture sampled this step
        prec1, _ = accuracy(logits, target, topk=(1, 5))
        objs.update(loss.data.item(), n)
        top1.update(prec1.data.item(), n)

        if step % args.report_freq == 0 and args.local_rank == 0:
            now = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
            print(
                '{} |=> train: {} / {}, lr={}, loss={:.2f}, acc={:.2f}, datatime={:.2f}, seed={}'
                .format(now, step, args.total_iters,
                        scheduler.get_last_lr()[0], objs.avg, top1.avg,
                        float(datatime), seed))

    if args.local_rank == 0:
        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print('{} |=> Test rng = {}'.format(now, fair_arc_list[0]))
        infer(val_dataprovider, model.module, criterion, fair_arc_list,
              val_iters, archloader)
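
Every example in this listing leans on AvgrageMeter and accuracy without defining them. A minimal sketch, assuming the usual DARTS-style utilities (update(val, n), reset(), and top-k precision reported in percent); the repo's actual versions may differ in detail:

class AvgrageMeter(object):
    """Running average of a scalar such as loss or accuracy."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.0
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


def accuracy(output, target, topk=(1,)):
    """Top-k precision (in percent) of `output` logits against `target`."""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
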
Example 2
def train(train_dataprovider, val_dataprovider, optimizer, scheduler, model, archloader, criterion, args, val_iters, seed, writer=None):
    objs, top1 = AvgrageMeter(), AvgrageMeter()

    for p in model.parameters():
        p.grad = torch.zeros_like(p)

    for step in range(args.total_iters):
        model.train()
        t0 = time.time()
        image, target = train_dataprovider.next()
        datatime = time.time() - t0
        n = image.size(0)
        optimizer.zero_grad()
        image = image.cuda(args.gpu)
        target = target.cuda(args.gpu)

        # Fair Sampling
        fair_arc_list = archloader.generate_niu_fair_batch()

        for arc in fair_arc_list:
            logits = model(image, archloader.convert_list_arc_str(arc))
            loss = criterion(logits, target)
            loss_reduce = reduce_tensor(loss, 0, args.world_size)
            loss.backward()

        nn.utils.clip_grad_value_(model.parameters(), args.grad_clip)
        optimizer.step()
        scheduler.step()

        prec1, _ = accuracy(logits, target, topk=(1, 5))
        objs.update(loss_reduce.data.item(), n)
        top1.update(prec1.data.item(), n)

        if step % args.report_freq == 0 and args.local_rank == 0:
            now = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
            print('{} |=> train: {} / {}, lr={}, loss={:.2f}, acc={:.2f}, datatime={:.2f}, seed={}'
                  .format(now, step, args.total_iters, scheduler.get_last_lr()[0], objs.avg, top1.avg, float(datatime), seed))

        if args.local_rank == 0 and step % 5 == 0 and writer is not None:
            writer.add_scalar("Train/loss", objs.avg, step)
            writer.add_scalar("Train/acc1", top1.avg, step)

        if args.local_rank == 0 and step % args.report_freq == 0:
            top1_val, objs_val = infer(val_dataprovider, model.module, criterion,
                                       fair_arc_list, val_iters, archloader)

            if writer is not None:
                writer.add_scalar("Val/loss", objs_val, step)
                writer.add_scalar("Val/acc1", top1_val, step)

            save_checkpoint(
                {'state_dict': model.state_dict(), }, step, args.exp)
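
save_checkpoint is not shown in this listing either. A plausible sketch matching the call above (state dict, iteration counter, experiment directory); treat the filename pattern as an assumption, not the repo's actual layout:

import os
import torch

def save_checkpoint(state, iters, exp_dir, tag='checkpoint'):
    # persist the supernet weights at each report interval (hypothetical layout)
    os.makedirs(exp_dir, exist_ok=True)
    torch.save(state, os.path.join(exp_dir, '{}-{:06d}.pth.tar'.format(tag, iters)))
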
def infer(val_dataprovider, model, criterion, fair_arc_list, val_iters,
          archloader):
    objs = AvgrageMeter()
    top1 = AvgrageMeter()
    model.eval()
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print('{} |=> Test rng = {}'.format(now, fair_arc_list[0]))

    with torch.no_grad():
        for step in range(val_iters):
            t0 = time.time()
            image, target = val_dataprovider.next()
            datatime = time.time() - t0
            image = image.cuda()
            target = target.cuda()
            logits = model(image,
                           archloader.convert_list_arc_str(fair_arc_list[0]))
            loss = criterion(logits, target)

            prec1, _ = accuracy(logits, target, topk=(1, 5))
            n = image.size(0)
            objs.update(loss.data.item(), n)
            top1.update(prec1.data.item(), n)

        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print(
            '{} |=> valid: step={}, loss={:.2f}, acc={:.2f}, datatime={:.2f}'.
            format(now, step, objs.avg, top1.avg, datatime))

    return top1.avg, objs.avg
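
The distributed examples in this listing call reduce_tensor and reduce_mean without defining them. A minimal sketch, assuming the usual all-reduce averaging helpers built on torch.distributed, with signatures mirrored from the call sites above and below:

import torch.distributed as dist

def reduce_mean(tensor, nprocs):
    # average a metric tensor across all ranks
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= nprocs
    return rt

def reduce_tensor(tensor, dst, world_size):
    # sum onto rank `dst`, then normalize by the world size
    rt = tensor.clone()
    dist.reduce(rt, dst=dst, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt
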
Example 4
    def train_fn(self, optimizer, criterion, loader, device, train=True):
        """
        Training method
        :param optimizer: optimization algorithm
        :param criterion: loss function
        :param loader: data loader for either training or testing set
        :param device: torch device
        :param train: boolean to indicate if training or test set is used
        :return: (accuracy, loss) on the data
        """
        score = AvgrageMeter()
        objs = AvgrageMeter()
        self.train()

        t = tqdm(loader)
        for images, labels in t:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            logits = self(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            acc, _ = accuracy(logits, labels, topk=(1, 5))
            n = images.size(0)
            objs.update(loss.item(), n)
            score.update(acc.item(), n)

            t.set_description('(=> Training) Loss: {:.4f}'.format(objs.avg))

        return score.avg, objs.avg
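
train_fn is written as a method on an nn.Module subclass, so it is driven from the outside roughly as below. A hypothetical usage sketch; Net, train_loader, and the hyperparameters are placeholders, not names from the original repo:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Net().to(device)  # Net: the nn.Module subclass that defines train_fn
optimizer = torch.optim.SGD(model.parameters(), lr=0.025, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(10):
    train_acc, train_loss = model.train_fn(optimizer, criterion, train_loader, device)
    print('epoch {}: acc={:.2f}, loss={:.4f}'.format(epoch, train_acc, train_loss))
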
Example 5
def validate(model, args, *, arch_loader=None):
    assert arch_loader is not None

    objs = AvgrageMeter()
    top1 = AvgrageMeter()
    top5 = AvgrageMeter()

    val_dataloader = args.val_dataloader

    model.eval()

    t1 = time.time()

    result_dict = {}

    # base_model = mutableResNet20().cuda()

    with torch.no_grad():
        for key, arch in tqdm(arch_loader):
            # reset the meters so each architecture is scored independently
            top1.reset()
            top5.reset()

            retrain_bn(model,
                       max_iters=5,
                       dataprovider=DataIterator(val_dataloader),
                       device=0,
                       cand=arch[0])

            for data, target in val_dataloader:  # one full pass over the val set
                target = target.type(torch.LongTensor)
                data, target = data.cuda(args.gpu), target.cuda(args.gpu)

                output = model(data, arch[0])

                prec1, prec5 = accuracy(output, target, topk=(1, 5))

                n = data.size(0)

                top1.update(prec1.item(), n)
                top5.update(prec5.item(), n)

            print("\t acc1: ", top1.avg)
            tmp_dict = {}
            tmp_dict['arch'] = arch[0]
            tmp_dict['acc'] = top1.avg

            result_dict[key[0]] = tmp_dict

    with open("acc_result_rank_%d.json" % args.local_rank, "w") as f:
        json.dump(result_dict, f)
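
validate above depends on DataIterator and retrain_bn, neither of which appears in this listing. A hedged sketch of both, assuming DataIterator is the usual infinite .next() wrapper (which would also back train_dataprovider.next() in the first examples) and retrain_bn is the standard BN-recalibration loop for a sampled candidate:

import torch

class DataIterator(object):
    # wraps a DataLoader so callers can pull batches with .next() forever
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)

    def next(self):
        try:
            return next(self.iterator)
        except StopIteration:
            self.iterator = iter(self.dataloader)
            return next(self.iterator)


def retrain_bn(model, max_iters, dataprovider, device, cand):
    # re-estimate BatchNorm running stats for candidate `cand` (hypothetical)
    for m in model.modules():
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
    model.train()
    with torch.no_grad():
        for _ in range(max_iters):
            image, _ = dataprovider.next()
            model(image.cuda(device, non_blocking=True), cand)
    model.eval()  # hand the model back in eval mode for the accuracy pass
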
def train(train_dataloader, val_dataloader, optimizer, scheduler, model, archloader, criterion, args, seed, epoch, writer=None):
    losses_, top1_, top5_ = AvgrageMeter(), AvgrageMeter(), AvgrageMeter()

    # for p in model.parameters():
    #     p.grad = torch.zeros_like(p)
    model.train()

    train_loader = tqdm(train_dataloader)
    train_loader.set_description(
        '[%s%04d/%04d %s%f]' % ('Epoch:', epoch + 1, args.epochs, 'lr:', scheduler.get_last_lr()[0]))
    for step, (image, target) in enumerate(train_loader):
        n = image.size(0)
        image = image.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        # Fair Sampling
        # [archloader.generate_niu_fair_batch(step)[-1]]
        # [16, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 64]
        spos_arc_list = archloader.generate_spos_like_batch().tolist()

        # for arc in fair_arc_list:
        # logits = model(image, archloader.convert_list_arc_str(arc))
        # loss = criterion(logits, target)
        # loss_reduce = reduce_tensor(loss, 0, args.world_size)
        # loss.backward()
        optimizer.zero_grad()
        logits = model(image, spos_arc_list[:-1])
        loss = criterion(logits, target)
        prec1, prec5 = accuracy(logits, target, topk=(1, 5))

        if torch.cuda.device_count() > 1:
            torch.distributed.barrier()

            loss = reduce_mean(loss, args.nprocs)
            prec1 = reduce_mean(prec1, args.nprocs)
            prec5 = reduce_mean(prec5, args.nprocs)

        loss.backward()

        # nn.utils.clip_grad_value_(model.parameters(), args.grad_clip)

        optimizer.step()

        losses_.update(loss.data.item(), n)
        top1_.update(prec1.data.item(), n)
        top5_.update(prec5.data.item(), n)

        postfix = {'train_loss': '%.6f' % (
            losses_.avg), 'train_acc1': '%.6f' % top1_.avg, 'train_acc5': '%.6f' % top5_.avg}
        train_loader.set_postfix(log=postfix)

        if args.local_rank == 0 and step % 10 == 0 and writer is not None:
            # global batch index: batches per epoch * epoch + step
            global_step = step + len(train_dataloader) * epoch
            writer.add_scalar("Train/loss", losses_.avg, global_step)
            writer.add_scalar("Train/acc1", top1_.avg, global_step)
            writer.add_scalar("Train/acc5", top5_.avg, global_step)
Example 7
    def eval_fn(self, loader, device, train=False):
        """
        Evaluation method
        :param loader: data loader for either training or testing set
        :param device: torch device
        :param train: boolean to indicate if training or test set is used
        :return: accuracy on the data
        """
        score = AvgrageMeter()
        self.eval()

        t = tqdm(loader)
        with torch.no_grad():  # no gradient needed
            for images, labels in t:
                images = images.to(device)
                labels = labels.to(device)

                outputs = self(images)
                acc, _ = accuracy(outputs, labels, topk=(1, 5))
                score.update(acc.item(), images.size(0))

                t.set_description('(=> Test) Score: {:.4f}'.format(score.avg))

        return score.avg
def infer(train_loader, val_loader, model, criterion, val_iters, archloader,
          args):

    objs_, top1_, top5_ = AvgrageMeter(), AvgrageMeter(), AvgrageMeter()

    model.eval()
    now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    # [16, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64, 64, 64]
    fair_arc_list = archloader.generate_niu_fair_batch(random.randint(
        0, 100))[-1].tolist()
    # archloader.generate_spos_like_batch().tolist()

    print('{} |=> Test rng = {}'.format(now, fair_arc_list))  # only evaluate the last sampled architecture

    # BN calibration
    # retrain_bn(model, 15, train_loader, fair_arc_list, device=0)

    with torch.no_grad():
        t0 = time.time()
        for step, (image, target) in enumerate(val_loader):
            datatime = time.time() - t0  # time spent waiting on the loader
            image = image.cuda(args.local_rank, non_blocking=True)
            target = target.cuda(args.local_rank, non_blocking=True)

            logits = model(image)  # , fair_arc_list)
            loss = criterion(logits, target)

            top1, top5 = accuracy(logits, target, topk=(1, 5))

            if torch.cuda.device_count() > 1:
                torch.distributed.barrier()

                loss = reduce_mean(loss, args.nprocs)
                top1 = reduce_mean(top1, args.nprocs)
                top5 = reduce_mean(top5, args.nprocs)

            n = image.size(0)
            objs_.update(loss.data.item(), n)
            top1_.update(top1.data.item(), n)
            top5_.update(top5.data.item(), n)
            t0 = time.time()  # restart the data-loading timer

        now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print(
            '{} |=> valid: step={}, loss={:.2f}, val_acc1={:.2f}, val_acc5={:.2f}, datatime={:.2f}'
            .format(now, step, objs_.avg, top1_.avg, top5_.avg, datatime))

    return top1_.avg, top5_.avg, objs_.avg
Example 9
def validate(model, train_loader, args, *, arch_loader=None):
    assert arch_loader is not None

    objs = AvgrageMeter()
    top1 = AvgrageMeter()
    top5 = AvgrageMeter()

    val_dataloader = args.val_dataloader

    t1 = time.time()

    result_dict = {}

    arch_loader = tqdm(arch_loader)
    for key, arch in arch_loader:
        arch_list = [int(itm) for itm in arch[0].split('-')]
        # reset the meters so each architecture is scored independently
        top1.reset()
        top5.reset()
        # bn calibration
        model.apply(bn_calibration_init)

        model.train()

        t1 = time.time()

        for idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.cuda(
                args.gpu, non_blocking=True), targets.cuda(args.gpu,
                                                           non_blocking=True)
            outputs = model(inputs, arch_list)
            del inputs, targets, outputs
            if idx > 2:
                break

        # print("bn calibration time:", time.time()-t1)

        t2 = time.time()

        model.eval()

        for data, target in val_dataloader:  # one full pass over the val set
            target = target.type(torch.LongTensor)
            data, target = data.cuda(args.gpu, non_blocking=True), target.cuda(
                args.gpu, non_blocking=True)

            output = model(data, arch_list)

            prec1, prec5 = accuracy(output, target, topk=(1, 5))

            n = data.size(0)

            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

        tmp_dict = {}
        tmp_dict['arch'] = arch[0]
        tmp_dict['acc'] = top1.avg / 100

        # print("val time:", time.time() - t2)

        result_dict[key[0]] = tmp_dict

        post_fix = {"top1": "%.6f" % (top1.avg / 100)}
        arch_loader.set_postfix(log=post_fix)

    with open("acc_%s.json" % (args.path.split('/')[1].split('.')[0]),
              "w") as f:
        json.dump(result_dict, f)
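
bn_calibration_init, applied above before the calibration passes, is not defined in this listing. A sketch assuming the reset-and-cumulative-average recipe from the slimmable-networks line of work; the actual hook may differ:

import torch

def bn_calibration_init(m):
    # reset BN so the calibration batches rebuild the running statistics
    if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
        m.reset_running_stats()
        m.momentum = None  # None => cumulative moving average in PyTorch
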
Example 10
def train(train_dataloader,
          val_dataloader,
          optimizer,
          scheduler,
          model,
          archloader,
          criterion,
          soft_criterion,
          args,
          seed,
          epoch,
          writer=None):
    losses_, top1_, top5_ = AvgrageMeter(), AvgrageMeter(), AvgrageMeter()

    model.train()
    widest = [
        16, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 32, 64, 64, 64, 64, 64,
        64, 64
    ]
    narrowest = [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

    train_loader = tqdm(train_dataloader)
    train_loader.set_description(
        '[%s%04d/%04d %s%f]' %
        ('Epoch:', epoch + 1, args.epochs, 'lr:', scheduler.get_last_lr()[0]))
    for step, (image, target) in enumerate(train_loader):
        n = image.size(0)
        image = image.cuda(args.gpu, non_blocking=True)
        target = target.cuda(args.gpu, non_blocking=True)

        if args.model_type in ["dynamic", "independent", "slimmable"]:
            # sandwich rule
            candidate_list = []
            candidate_list += [narrowest]
            candidate_list += [
                archloader.generate_spos_like_batch().tolist()
                for i in range(6)
            ]

            # archloader.generate_niu_fair_batch(step)
            # one full pass through the widest model (soft targets for distillation)
            soft_target = model(image, widest)
            soft_loss = criterion(soft_target, target)
            soft_loss.backward()
            soft_target = torch.nn.functional.softmax(soft_target,
                                                      dim=1).detach()

            # one pass through each sampled subnet
            for arc in candidate_list:
                logits = model(image, arc)
                # loss = soft_criterion(logits, soft_target.cuda(
                #     args.gpu, non_blocking=True))
                loss = criterion(logits, target)

                # loss_reduce = reduce_tensor(loss, 0, args.world_size)
                loss.backward()
        elif args.model_type == "original":
            logits = model(image)
            loss = criterion(logits, target)
            loss.backward()

        prec1, prec5 = accuracy(logits, target, topk=(1, 5))

        if torch.cuda.device_count() > 1:
            torch.distributed.barrier()

            loss = reduce_mean(loss, args.nprocs)
            prec1 = reduce_mean(prec1, args.nprocs)
            prec5 = reduce_mean(prec5, args.nprocs)

        optimizer.step()
        optimizer.zero_grad()

        losses_.update(loss.data.item(), n)
        top1_.update(prec1.data.item(), n)
        top5_.update(prec5.data.item(), n)

        postfix = {
            'train_loss': '%.6f' % (losses_.avg),
            'train_acc1': '%.6f' % top1_.avg,
            'train_acc5': '%.6f' % top5_.avg
        }

        train_loader.set_postfix(log=postfix)

        if args.local_rank == 0 and step % 10 == 0 and writer is not None:
            # global batch index: batches per epoch * epoch + step
            global_step = step + len(train_dataloader) * epoch
            writer.add_scalar("Train/loss", losses_.avg, global_step)
            writer.add_scalar("Train/acc1", top1_.avg, global_step)
            writer.add_scalar("Train/acc5", top5_.avg, global_step)
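
The commented-out distillation branch above expects a soft_criterion that scores subnet logits against the widest network's detached soft targets. A minimal sketch, assuming the cross-entropy-with-soft-targets form used by slimmable/sandwich-rule training:

import torch

class CrossEntropyLossSoft(torch.nn.Module):
    # cross-entropy against a soft (probability-distribution) target
    def forward(self, output, soft_target):
        log_prob = torch.nn.functional.log_softmax(output, dim=1)
        return -(soft_target * log_prob).sum(dim=1).mean()
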
Example 11
def validate(model, device, args, *, all_iters=None, arch_loader=None):
    assert arch_loader is not None

    objs = AvgrageMeter()
    top1 = AvgrageMeter()
    top5 = AvgrageMeter()

    loss_function = args.loss_function
    val_loader = args.val_loader

    model.eval()
    t1 = time.time()

    result_dict = {}

    arch_dict = arch_loader.get_part_dict()

    with torch.no_grad():
        for ii, (key, value) in enumerate(arch_dict.items()):
            # reset the meters so each architecture is scored independently
            objs.reset()
            top1.reset()
            top5.reset()

            for data, target in val_loader:
                target = target.type(torch.LongTensor)
                data, target = data.to(device), target.to(device)

                output = model(data, value["arch"])
                loss = loss_function(output, target)

                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                n = data.size(0)
                objs.update(loss.item(), n)

                top1.update(acc1.item(), n)
                top5.update(acc5.item(), n)

            if ii % 5 == 0:
                logging.info("validate acc:{:.6f} iter:{}".format(
                    top1.avg / 100, ii))
                # `writer` is assumed to be a module-level SummaryWriter
                writer.add_scalar(
                    "Val/Loss", loss.item(),
                    all_iters * len(val_loader) * args.batch_size + ii)
                writer.add_scalar(
                    "Val/acc1", acc1.item(),
                    all_iters * len(val_loader) * args.batch_size + ii)
                writer.add_scalar(
                    "Val/acc5", acc5.item(),
                    all_iters * len(val_loader) * args.batch_size + ii)

            result_dict[key] = top1.avg

    logInfo = 'TEST Iter {}: loss = {:.6f},\t'.format(all_iters, objs.avg) + \
              'Top-1 acc = {:.6f},\t'.format(top1.avg) + \
              'Top-5 acc = {:.6f},\t'.format(top5.avg) + \
              'val_time = {:.6f}'.format(time.time() - t1)
    logging.info(logInfo)

    logging.info("RESULTS")
    for ii, (key, value) in enumerate(result_dict.items()):
        logging.info("{: ^10}  \t  {:.6f}".format(key, value))
        if ii > 10:
            break
    logging.info("E N D")