Example #1
    def train_model(self, model, train_data, test_data, number_of_classes,
                    y_train, counts):
        # Do we need DataParallel? Yes, we do.
        # model = torch.nn.DataParallel(model)
        criterion = nn.CrossEntropyLoss().cuda()

        # optimizer = optim.Adadelta(model.parameters(), lr=self.learning_rate)
        # optimizer = optim.Adam(model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=self.learning_rate,
                                  weight_decay=self.weight_decay)
        best_so_far = 0
        state = None
        for i_epoch in range(self.epoch):
            adjust_learning_rate(self.learning_rate, optimizer, i_epoch)
            train_loss, batches = self.train(model, train_data, criterion,
                                             optimizer, i_epoch, best_so_far,
                                             counts, number_of_classes)
            best_so_far, test_loss, test_batches, precision, recall, f1, accuracy, state = self.test(
                model, test_data, criterion, i_epoch, best_so_far, counts,
                number_of_classes, state)

            print(
                f"Epoch {i_epoch+1}/{self.epoch}, training loss: {train_loss/batches}, validation loss: {test_loss/test_batches}"
            )
            self.print_scores(precision, recall, f1, accuracy, test_batches)
        ckpt_dir = os.path.join(self.trainer_logs_out, self.checkpoint_file)
        if not os.path.isdir(ckpt_dir):
            os.mkdir(ckpt_dir)
        torch.save(state, os.path.join(ckpt_dir, 'ckpt.t7'))

        return model
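
These examples all lean on an adjust_learning_rate helper whose definition is not shown. A minimal step-decay sketch compatible with the positional call adjust_learning_rate(self.learning_rate, optimizer, i_epoch) in Example #1 follows; the step size and decay factor are illustrative assumptions, not values taken from the source project.

def adjust_learning_rate(initial_lr, optimizer, epoch, step=20, factor=0.1):
    # Assumed behaviour: decay the learning rate by `factor` every `step` epochs.
    lr = initial_lr * (factor ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
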
Example #2
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled)
    criterion = DFPLoss(alpha=args.alpha, beta=args.beta)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Within Loss', 'Between Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["cls_loss"], train_out["dis_loss_within"],
                           train_out["dis_loss_between"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net, trainloader, device, args.plotfolder1, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max,
                             plot_quality=args.plot_quality, normalized=args.plot_normalized)
    if args.plot:
        # plot the test set
        plot_feature(net, testloader, device, args.plotfolder1, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                     plot_quality=args.plot_quality, normalized=args.plot_normalized)

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {"net": net,
            "distance": distance_results
            }
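
The resume blocks in these examples read checkpoint['net'] and checkpoint['epoch'] (and sometimes checkpoint['optimizer']), which pins down what save_model must write. Below is a minimal sketch consistent with the four-argument calls such as save_model(net, optimizer, epoch, path); it is an assumption about the helper, and the three-argument calls elsewhere presumably bind a variant without the optimizer slot.

import torch

def save_model(net, optimizer, epoch, path):
    # Assumed checkpoint layout, mirrored from the resume code in these examples.
    state = {'net': net.state_dict(), 'epoch': epoch}
    if optimizer is not None:
        state['optimizer'] = optimizer.state_dict()
    torch.save(state, path)
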
Example #3
def main_stage1():
    print(f"\nStart Stage-1 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    # data loader

    # Model
    print('==> Building model..')
    net = Network(backbone=args.arch,
                  embed_dim=512,
                  num_classes=args.train_class_num,
                  use_fc=False,
                  attmodule=False,
                  classifier='dotproduct',
                  backbone_fc=False,
                  data_shape=4)
    # net = models.__dict__[args.arch](num_classes=args.train_class_num) # CIFAR 100
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(
            ['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        print('\nStage_1 Epoch: %d   Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        adjust_learning_rate(optimizer, epoch, args.lr, step=10)
        train_loss, train_acc = stage1_train(net, trainloader, optimizer,
                                             criterion, device)
        save_model(net, None, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, optimizer.param_groups[0]['lr'], train_loss, train_acc
        ])
    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
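
Logger is used with the same small surface in every example: construct with a file path (plus optional resume or title), call set_names once, append one row per epoch, and close at the end. The following tab-separated sketch matches that interface; it is an assumption, not the projects' actual utility.

class Logger:
    def __init__(self, fpath, title='', resume=False):
        # Append when resuming so earlier epochs are preserved.
        self.file = open(fpath, 'a' if resume else 'w')
        self.title = title

    def set_names(self, names):
        self.file.write('\t'.join(names) + '\n')
        self.file.flush()

    def append(self, values):
        row = '\t'.join('{:.6f}'.format(v) if isinstance(v, float) else str(v)
                        for v in values)
        self.file.write(row + '\n')
        self.file.flush()

    def close(self):
        self.file.close()
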
Example #4
def train(train_loader, model, optimizer, epoch, logger):
    # switch to train mode
    model.train()
    pbar = tqdm(enumerate(train_loader))
    for batch_idx, (data_a, data_p, data_n) in pbar:
        if args.cuda:
            data_a, data_p, data_n = data_a.cuda(), data_p.cuda(), data_n.cuda()

        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)

        # HardNet-style triplet margin loss; the functional expects (anchor, positive, negative)
        loss = F.triplet_margin_loss(out_a, out_p, out_n, margin=args.margin, swap=args.anchorswap)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        adjust_learning_rate(optimizer, args)
        if logger is not None:
            logger.log_value('loss', loss.item()).step()

        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data_a), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.item()))

    torch.save({'epoch': epoch + 1, 'state_dict': model.state_dict()},
               '{}/checkpoint_{}.pth'.format(LOG_DIR, epoch))
Example #5
def main_stage2(net, mid_energy):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage1_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage1_lr,
                                 factor=args.stage1_lr_factor,
                                 step=args.stage1_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, epoch,
                       os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append(
                [epoch + 1, train_out["train_loss"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net,
                             args,
                             trainloader,
                             device,
                             args.plotfolder,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net,
                             args,
                             testloader,
                             device,
                             args.plotfolder,
                             epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality,
                             testmode=True)
        logger.close()
        print(f"\nFinish Stage-1 training...\n")
Example #6
def main():
    print(f"\nStart  training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = BuildNet(backbone=args.arch,
                   num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(
            ['Epoch', 'Train Loss', 'Train Acc.', "Test F1", 'threshold'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.lr,
                                 factor=args.lr_factor,
                                 step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            test_out = test(net, testloader, criterion, device)
            logger.append([
                epoch + 1, train_out["train_loss"], train_out["accuracy"],
                test_out["best_F1"], test_out["best_thres"]
            ])
        logger.close()
        print(f"\nFinish training...\n")

    else:
        print("===> Evaluating ...")
        test(net, testloader, criterion, device)
Example #7
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim, p=args.p)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr,
                                 factor=args.stage1_lr_factor, step=args.stage1_lr_step)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net, args, trainloader, device, args.plotfolder, epoch=epoch,
                             plot_class_num=args.train_class_num, plot_quality=args.plot_quality)
                plot_feature(net, args, testloader, device, args.plotfolder, epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1, plot_quality=args.plot_quality, testmode=True)
        logger.close()
        print(f"\nFinish Stage-1 training...\n")

    print("===> Evaluating stage-1 ...")
    stage_test(net, testloader, device)
    mid_dict = stage_valmixup(net, trainloader, device)
    print("===> stage1 energy based classification")
    stage_evaluate(net, testloader, mid_dict["mid_unknown"].item(), mid_dict["mid_known"].item(), feature="energy")
    print("===> stage1 softmax based classification")
    stage_evaluate(net, testloader, 0., 1., feature="normweight_fea2cen")
    return {
        "net": net.state_dict(),
        "mid_known": mid_dict["mid_known"],
        "mid_unknown": mid_dict["mid_unknown"]
    }
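
Note that the later examples call the scheduler with the reversed argument order and explicit knobs, e.g. adjust_learning_rate(optimizer, epoch, args.stage1_lr, factor=args.stage1_lr_factor, step=args.stage1_lr_step). A sketch of that second signature, again with assumed step-decay semantics:

def adjust_learning_rate(optimizer, epoch, init_lr, factor=0.1, step=30):
    # Assumed behaviour: multiply init_lr by factor once every step epochs.
    lr = init_lr * (factor ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
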
Example #8
File: mnist.py  Project: 13952522076/OSR
def main():
    print(f"------------Running on {device}------------")
    print('==> Building model..')
    net = NetBuilder(backbone=args.arch, embed_dim=args.embed_dim)
    net = net.to(device)
    center = calcuate_center(net, trainloader, device)
    net._init_centroid(center)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True
    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Train Loss'])

    criterion = DSVDDLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr, step=20)
        print('\nEpoch: %d | Learning rate: %f ' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        train_loss = train(net, trainloader, optimizer, criterion, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
        logger.append([epoch + 1, train_loss])
        if args.plot:
            # plot training set
            plot_feature(net,
                         args,
                         trainloader,
                         device,
                         args.plotfolder,
                         epoch=epoch,
                         plot_quality=150)
            # plot testing set
            plot_feature(net,
                         args,
                         testloader,
                         device,
                         args.plotfolder,
                         epoch='test_' + str(epoch),
                         plot_quality=150)
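
Example #8 initialises a Deep-SVDD-style centroid from the network before wrapping it in DataParallel. A plausible sketch of calcuate_center (name kept exactly as the example spells it), assuming the forward pass returns embedding vectors:

import torch

@torch.no_grad()
def calcuate_center(net, trainloader, device):
    net.eval()
    total, count = None, 0
    for inputs, _ in trainloader:
        feats = net(inputs.to(device))  # assumption: net(x) yields embeddings
        total = feats.sum(dim=0) if total is None else total + feats.sum(dim=0)
        count += feats.size(0)
    return total / count
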
Example #9
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(scaling=args.scaling)
    optimizer = optim.Adam(net.parameters(), lr=args.stage1_lr)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, factor=0.2, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([epoch + 1, train_out["train_loss"], train_out["accuracy"]])
        if args.plot:
            plot_feature(net, args, trainloader, device, args.plotfolder, epoch=epoch,
                         plot_class_num=args.train_class_num, plot_quality=args.plot_quality)
            plot_feature(net, args, testloader, device, args.plotfolder, epoch="test" + str(epoch),
                         plot_class_num=args.train_class_num + 1, plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {
        "net": net
    }
Example #10
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled, cosine_weight=args.cosine_weight)
    # embed_dim = net.feat_dim if not args.embed_dim else args.embed_dim
    # criterion_cls = nn.CrossEntropyLoss()
    criterion_dis = DFPLoss(beta=args.beta, sigma=args.sigma)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
                          'Within Loss', 'Between Loss', 'Cen2cen Loss', 'Train Acc.'])

    for epoch in range(start_epoch, start_epoch + args.stage1_es):
        print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
        train_out = stage1_train(net, trainloader, optimizer, criterion_dis, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        # ['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
        # 'Within Loss', 'Between Loss','Cen2cen loss', 'Train Acc.']
        logger.append([epoch + 1, train_out["train_loss"], 0.0,
                       train_out["dis_loss_total"], train_out["dis_loss_within"],
                       train_out["dis_loss_between"], train_out["dis_loss_cen2cen"], train_out["accuracy"]])
        if args.plot:
            plot_feature(net, trainloader, device, args.plotfolder, epoch=epoch,
                         plot_class_num=args.train_class_num, maximum=args.plot_max, plot_quality=args.plot_quality)
    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
Example #11
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known, mid_unknown=mid_unknown,
                             gamma=args.gamma, temperature=args.temperature,
                             feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage2_lr, momentum=0.9, weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Class Loss', 'Energy Loss',
                          'Energy Known', 'Energy Unknown', 'Train Acc.', "Test F1"])

    if not args.evaluate:
        best_F1_list = []
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr,
                                 factor=args.stage2_lr_factor, step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion, device)

            save_model(net, optimizer, epoch, os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            # test_out = test_with_hist(net, testloader, device, name=f"stage2_test{epoch}")
            test_out = test(net, testloader, device)
            # stage_valmixup(net, trainloader, device, name=f"stage2_mixup{epoch}")
            logger.append([epoch + 1, train_out["train_loss"], train_out["loss_classification"],
                           train_out["loss_energy"], train_out["loss_energy_known"],
                           train_out["loss_energy_unknown"], train_out["accuracy"],
                           test_out["best_F1"]
                           ])
            best_F1_list.append(test_out["best_F1"])
        logger.close()
        print(f"\nFinish Stage-2 training...\n")
        last_five = np.array(best_F1_list[-5:])
        print(f"\nGamma:{args.gamma} | F1_mean: {last_five.mean()} | F1_std: {last_five.std()}")
Example #12
def main_stage2(stage1_dict):
    net1 = stage1_dict['net']
    thresholds = stage1_dict['distance']['thresholds']
    estimator = stage1_dict['estimator']
    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net2 = DFPNet(backbone=args.arch,
                  num_classes=args.train_class_num,
                  embed_dim=args.embed_dim,
                  distance=args.distance,
                  similarity=args.similarity,
                  scaled=args.scaled,
                  thresholds=thresholds,
                  norm_centroid=args.norm_centroid,
                  amplifier=args.amplifier,
                  estimator=estimator)
    net2 = net2.to(device)
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        init_stage2_model(net1, net2)

    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Generate within', 'Generate 2origin', 'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    optimizer = optim.SGD(net2.parameters(),
                          lr=args.stage2_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            print('\nStage_2 Epoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            # A single optimizer is shared across components for simplicity; performance did not vary much.
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=10)
            train_out = stage2_train(net2, trainloader, optimizer, criterion,
                                     device)
            save_model(net2, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, train_out["train_loss"],
                train_out["loss_similarity"], train_out["distance_in"],
                train_out["distance_out"], train_out["generate_within"],
                train_out["generate_2orign"], train_out["accuracy"]
            ])
            if args.plot:
                plot_feature(net2,
                             args,
                             trainloader,
                             device,
                             args.plotfolder2,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds)
                plot_feature(net2,
                             args,
                             testloader,
                             device,
                             args.plotfolder2,
                             epoch="test_" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds,
                             testmode=True)
        if args.plot:
            # plot the test set
            plot_feature(net2,
                         args,
                         testloader,
                         device,
                         args.plotfolder2,
                         epoch="test",
                         plot_class_num=args.train_class_num + 1,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality,
                         norm_centroid=args.norm_centroid,
                         thresholds=thresholds,
                         testmode=True)
        print(f"\nFinish Stage-2 training...\n")

    logger.close()

    # test2(net2, testloader, device)
    return net2
Example #13
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = DFPNormLoss(mid_known=1.3 * mid_known,
                            mid_unknown=0.7 * mid_unknown,
                            alpha=args.alpha,
                            temperature=args.temperature,
                            feature='energy')
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage2_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Class Loss', 'Energy Loss', 'Energy Known',
            'Energy Unknown', 'Train Acc.'
        ])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, train_out["train_loss"],
                train_out["loss_classification"], train_out["loss_energy"],
                train_out["loss_energy_known"],
                train_out["loss_energy_unknown"], train_out["accuracy"]
            ])
            if args.plot:
                plot_feature(net,
                             args,
                             trainloader,
                             device,
                             args.plotfolder,
                             epoch="stage2_" + str(epoch),
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net,
                             args,
                             testloader,
                             device,
                             args.plotfolder,
                             epoch="stage2_test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality,
                             testmode=True)
        logger.close()
        print(f"\nFinish Stage-2 training...\n")

        print("===> Evaluating stage-2 ...")
        stage_test(net, testloader, device, name="stage2_test_doublebar")
        stage_valmixup(net, trainloader, device, name="stage2_mixup_result")
Example #14
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/cifar/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = CIFAR100(root='../../data', train=True, download=True, transform=transform_train,
                        train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                        includes_all_train_class=args.includes_all_train_class)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testset = CIFAR100(root='../../data', train=False, download=True, transform=transform_test,
                       train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                       includes_all_train_class=args.includes_all_train_class)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)


    # Model
    print('==> Building model..')
    net = models.__dict__[args.arch](num_classes=args.train_class_num) # CIFAR 100
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss','Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, start_epoch + args.es):
            print('\nEpoch: %d   Learning rate: %f' % (epoch + 1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr)
            train_loss, train_acc = train(net, trainloader, optimizer, criterion, device)
            save_model(net, None, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0  # per-epoch testing is skipped; test() runs once after the loop
            logger.append([epoch + 1, optimizer.param_groups[0]['lr'], train_loss, train_acc, test_loss, test_acc])

    test(epoch, net, trainloader, testloader, criterion, device)
    logger.close()
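
mkdir_p in Example #14 is most likely the usual recursive-makedirs wrapper; a one-line sketch under that assumption:

import os

def mkdir_p(path):
    # Assumed behaviour: create the directory tree, ignoring it if it already exists.
    os.makedirs(path, exist_ok=True)
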
Example #15
def main_stage2(stage1_dict):
    print('==> Building stage2 model..')
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        net = stage1_dict['net']
        net = net.to(device)
        thresholds = stage1_dict['distance']['thresholds']
        # stat = stage1_dict["stat"]
        net.module.set_threshold(thresholds.to(device))

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            try:
                thresholds = checkpoint['net']['thresholds']
            except KeyError:
                thresholds = checkpoint['net']['module.thresholds']
            net.module.set_threshold(thresholds.to(device))

            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Distance Center', 'Train Acc.'
        ])

    if args.evaluate:
        stage2_test(net, testloader, device)
        return net

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage2_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage2_es):
        print('\nStage_2 Epoch: %d   Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        # A single optimizer is shared across components for simplicity; performance did not vary much.
        adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
        # if epoch %5 ==0:
        #     distance_results = plot_distance(net, trainloader, device, args)
        #     thresholds = distance_results['thresholds']
        #     net.module.set_threshold(thresholds.to(device))
        train_out = stage2_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
        stage2_test(net, testloader, device)
        # stat = get_gap_stat(net2, trainloader, device, args)

        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["distance_in"], train_out["distance_out"],
            train_out["distance_center"], train_out["accuracy"]
        ])

    print(f"\nFinish Stage-2 training...\n")

    logger.close()
    stage2_test(net, testloader, device)
    return net
Example #16
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # for initializing backbone, two branches, and centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance Loss',
            'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss(alpha=args.alpha)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage1_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["loss_distance"], train_out["accuracy"]
        ])

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)
    # print(f"the distance thresholds are\n {distance_results['thresholds']}\n")
    # gap_results = plot_gap(net, trainloader, device, args)
    # stat = get_gap_stat(net, trainloader, device, args)
    # estimator =CGD_estimator(gap_results)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {
        "net": net,
        "distance": distance_results,
        # "stat": stat
    }
Example #17
def main():
    print(f"\nStart  training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = BuildNet(backbone=args.arch,
                   num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            loggerList = []
            for i in range(args.train_class_num, args.test_class_num + 1):
                loggerList.append(
                    Logger(os.path.join(args.checkpoint, f'log{i}.txt'),
                           resume=True))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint, f'log{i}.txt'))
            logger.set_names([
                'Epoch', 'Train Loss', 'Train Acc.', "Pos-F1", 'Norm-F1',
                'Energy-F1'
            ])
            loggerList.append(logger)

    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.lr,
                                 factor=args.lr_factor,
                                 step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))

            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data',
                    train=False,
                    download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(testset,
                                                         batch_size=args.bs,
                                                         shuffle=False,
                                                         num_workers=4)
                test_out = test(net, testloader, criterion, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([
                    epoch + 1, train_out["train_loss"], train_out["accuracy"],
                    test_out["best_F1_possibility"], test_out["best_F1_norm"],
                    test_out["best_F1_energy"]
                ])
        for logger in loggerList:
            logger.close()
        print(f"\nFinish training...\n")
Example #18
def main():
    # Training devices
    if args.gpu is not None:
        print('Using GPU %d' % args.gpu)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    else:
        print('CPU mode')

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    val_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    # Load the dataset. When training, disable the random crop.
    train_data = MyDataLoader(transform=train_transform,
                              trainval='train',
                              data_path=args.pascal_path,
                              random_crops=0)
    train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                               batch_size=args.batch,
                                               shuffle=True,
                                               num_workers=CORES)

    val_data = MyDataLoader(transform=val_transform,
                            trainval='test',
                            data_path=args.pascal_path,
                            random_crops=args.crops)
    val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                             batch_size=args.batch,
                                             shuffle=False,
                                             num_workers=CORES)

    N = len(train_data.names)
    iter_per_epoch = N // args.batch

    # Network initialize
    # finetune: freeze some layers and modify the fc layer.
    if args.finetune is not None:
        # Initialize the network
        net = models[args.model](pretrained=True)
        # Freeze conv layers
        for i, (name, param) in enumerate(net.named_parameters()):
            if 'conv' in name:
                param.requires_grad = False
        # Modify the fc layer
        in_channel = net.fc.in_features
        net.fc = torch.nn.Linear(in_features=in_channel, out_features=20)

    elif args.modelpath is not None:
        net = load_model_from_file(args.modelpath,
                                   model=args.model,
                                   load_fc=args.fc)

    else:
        net = models[args.model](pretrained=False, num_classes=20)

    if args.gpu is not None:
        net.cuda()

    criterion = torch.nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                       net.parameters()),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0001)

    ############## TRAINING ###############
    print("Start training, lr: %f, batch-size: %d" % (args.lr, args.batch))
    print("Model: " + args.model)
    print("Checkpoint Path: " + args.checkpoint)
    print("Time: " + prefix)
    if args.modelpath is not None:
        print("Training from past model: " + args.modelpath)

    # Train the Model
    steps = args.iter_start
    for epoch in range(iter_per_epoch * args.iter_start, args.epochs):
        adjust_learning_rate(optimizer,
                             epoch,
                             init_lr=args.lr,
                             step=20,
                             decay=0.1)

        mAP = []
        for i, (images, labels) in enumerate(train_loader):
            if args.gpu is not None:
                images = images.cuda()
                labels = labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = net(images)

            mAP.append(compute_mAP(labels.data, outputs.data))

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            loss = loss.item()

            if steps % 100 == 0:
                print('[%d/%d] step %d, Loss: %.3f, mAP %.2f%%' %
                      (epoch + 1, args.epochs, steps, loss,
                       100 * np.mean(mAP[-20:])))

            steps += 1

        if epoch % 5 == 0:
            filename = '%s/%s_%s_%03i.pth' % (args.checkpoint, args.model,
                                              prefix, epoch + 1)
            torch.save(net.state_dict(), filename)
            print('Saved: ' + filename)

            eval_map(net, None, val_loader, steps, args.gpu, args.crops)

        if os.path.exists(args.checkpoint + '/stop.txt'):
            # break without using CTRL+C
            break
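
Example #18 tracks multi-label mAP per batch via compute_mAP(labels.data, outputs.data). One plausible implementation averages per-class average precision with scikit-learn; this is an assumption about the helper, not its actual source.

import numpy as np
from sklearn.metrics import average_precision_score

def compute_mAP(labels, outputs):
    # labels, outputs: (batch, num_classes) tensors of 0/1 targets and scores.
    y_true = labels.cpu().numpy()
    y_score = outputs.cpu().numpy()
    aps = [average_precision_score(y_true[:, c], y_score[:, c])
           for c in range(y_true.shape[1]) if y_true[:, c].any()]
    return float(np.mean(aps)) if aps else 0.0
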
Example #19
def main():
    global best_prec1, args

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print(
                "Warning:  if --fp16 is not used, static_loss_scale will be ignored."
            )

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = Network(backbone=args.arch, num_classes=args.train_class_num)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf
        # for the older version of APEX please use shared_param, for newer one it is delay_allreduce
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)

    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if args.local_rank == 0:
                logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                                title=title,
                                resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        if args.local_rank == 0:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
                'Valid Acc.', 'Valid Top5.'
            ])

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if (args.arch == "inception_v3"):
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.workers,
                           device_id=args.local_rank,
                           data_dir=traindir,
                           crop=crop_size,
                           dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size)
    # pipe.build()
    # val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # if args.evaluate:
    #     validate(val_loader, model, criterion)
    #     return

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)

        if args.local_rank == 0:
            print('\nEpoch: [%d | %d] LR: %f' %
                  (epoch + 1, args.epochs, optimizer.param_groups[0]['lr']))

        [train_loss, train_acc,
         avg_train_time] = train(train_loader, model, criterion, optimizer,
                                 epoch)
        total_time.update(avg_train_time)
        # evaluate on validation set
        # [test_loss, prec1, prec5] = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            # append logger file
            # logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5])
            logger.append([
                optimizer.param_groups[0]['lr'], train_loss, 0.0, train_acc,
                0.0, 0.0
            ])

            # is_best = prec1 > best_prec1
            is_best = False
            # best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,  # so a resumed run starts at the next epoch
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                checkpoint=args.checkpoint,
                filename="checkpoint.pth.tar")
            # if epoch == args.epochs - 1:
            #     print('##Top-1 {0}\n'
            #           '##Top-5 {1}\n'
            #           '##Perf  {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg))

        # reset DALI iterators
        train_loader.reset()
        # val_loader.reset()

    if args.local_rank == 0:
        logger.close()
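
# NOTE: every example on this page calls an `adjust_learning_rate` helper that
# the snippets never define. A minimal step-decay sketch matching the
# (optimizer, epoch, lr, factor, step) call sites in main_stage2() below and in
# most other examples -- an assumption, not the original implementation
# (main() above uses an args-based variant of the same idea):
def adjust_learning_rate(optimizer, epoch, lr, factor=0.1, step=30):
    """Scale the base learning rate by `factor` every `step` epochs."""
    new_lr = lr * (factor ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
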
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known,
                             mid_unknown=mid_unknown,
                             gamma=args.gamma,
                             temperature=args.temperature,
                             feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage2_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            loggerList = []
            for i in range(args.train_class_num, args.test_class_num + 1):
                loggerList.append(
                    Logger(os.path.join(args.checkpoint, f'log{i}_stage2.txt'),
                           resume=True))
        else:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint,
                                         f'log{i}_stage2.txt'))
            logger.set_names(
                ['Epoch', 'Train Loss', 'Train Acc.', 'Energy-F1'])
            loggerList.append(logger)

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))

            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data',
                    train=False,
                    download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(
                    testset,
                    batch_size=args.stage2_bs,
                    shuffle=False,
                    num_workers=4)
                test_out = test(net, testloader, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([
                    epoch + 1, train_out["train_loss"], train_out["accuracy"],
                    test_out["best_F1"]
                ])
        # close every per-class logger, not just the last one bound in the loop
        for lg in loggerList:
            lg.close()
        print("\nFinished Stage-2 training...\n")
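
# `save_model` is another helper these snippets assume but never show. A
# plausible sketch for the (net, optimizer, epoch, path) signature used above,
# chosen to round-trip with the resume code that reads back 'net', 'optimizer'
# and 'epoch' -- an assumption, not the original:
def save_model(net, optimizer, epoch, path):
    state = {
        'net': net.state_dict(),
        'optimizer': optimizer.state_dict() if optimizer is not None else None,
        'epoch': epoch,
    }
    torch.save(state, path)
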
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    # Data
    print('==> Preparing data..')
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    trainset = MNIST(root='../../data',
                     train=True,
                     download=True,
                     transform=transform,
                     train_class_num=args.train_class_num,
                     test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data',
                    train=False,
                    download=True,
                    transform=transform,
                    train_class_num=args.train_class_num,
                    test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
    # data loader
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.bs,
                                              shuffle=True,
                                              num_workers=4)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.bs,
                                             shuffle=False,
                                             num_workers=4)

    # Model
    net = Network(backbone=args.arch,
                  num_classes=args.train_class_num,
                  embed_dim=args.embed_dim)
    fea_dim = net.classifier.in_features
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names([
            'Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.', 'Test Loss',
            'Test Acc.'
        ])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            print('\nEpoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            train_loss, train_acc = train(net, trainloader, optimizer,
                                          criterion, device)
            save_model(net, None, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0  # placeholders; test() below reports the real metrics
            logger.append([
                epoch + 1, optimizer.param_groups[0]['lr'], train_loss,
                train_acc, test_loss, test_acc
            ])
            plot_feature(net,
                         trainloader,
                         device,
                         args.plotfolder,
                         epoch=epoch,
                         plot_class_num=args.train_class_num,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality)
            test(epoch, net, trainloader, testloader, criterion, device)

    test(99999, net, trainloader, testloader, criterion, device)
    plot_feature(net,
                 testloader,
                 device,
                 args.plotfolder,
                 epoch="test",
                 plot_class_num=args.train_class_num + 1,
                 maximum=args.plot_max,
                 plot_quality=args.plot_quality)
    logger.close()
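
# The `Logger` used throughout these examples follows the common
# pytorch-classification utility: a tab-separated text log with named columns.
# A minimal sketch of just the interface exercised above (assumed, not the
# original):
class Logger(object):
    def __init__(self, fpath, title=None, resume=False):
        self.title = title
        self.file = open(fpath, 'a' if resume else 'w')

    def set_names(self, names):
        self.file.write('\t'.join(names) + '\n')
        self.file.flush()

    def append(self, values):
        self.file.write('\t'.join(str(v) for v in values) + '\n')
        self.file.flush()

    def close(self):
        self.file.close()
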
Example #22
def main_stage2(net1, centroids):

    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    # Ignore the ClassAwareSampler since we are not focusing on the long-tailed problem.
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.bs,
                                              shuffle=True,
                                              num_workers=4)
    print('==> Building model..')
    net2 = Network(backbone=args.arch,
                   embed_dim=512,
                   num_classes=args.train_class_num,
                   use_fc=True,
                   attmodule=True,
                   classifier='metaembedding',
                   backbone_fc=False,
                   data_shape=4)
    net2 = net2.to(device)
    if not args.evaluate:
        init_stage2_model(net1, net2)

    criterion = nn.CrossEntropyLoss()
    fea_criterion = DiscCentroidsLoss(args.train_class_num,
                                      args.stage1_feature_dim)
    fea_criterion = fea_criterion.to(device)
    optimizer = optim.SGD(net2.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    optimizer_criterion = optim.SGD(fea_criterion.parameters(),
                                    lr=args.lr * 0.1,
                                    momentum=0.9,
                                    weight_decay=5e-4)

    # passing centroids data.
    if not args.evaluate:
        pass_centroids(net2, fea_criterion, init_centroids=centroids)

    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(
            ['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            print('\nStage_2 Epoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            # For simplicity, the schedule is applied only to the main optimizer; performance did not vary much.
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            train_loss, train_acc = stage2_train(net2, trainloader, optimizer,
                                                 optimizer_criterion,
                                                 criterion, fea_criterion,
                                                 device)
            save_model(net2, None, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, optimizer.param_groups[0]['lr'], train_loss,
                train_acc
            ])
            pass_centroids(net2, fea_criterion, init_centroids=None)
            if epoch % 5 == 0:
                test(net2, testloader, device)
        print("\nFinished Stage-2 training...\n")
    logger.close()

    test(net2, testloader, device)
    return net2
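
# `init_stage2_model(net1, net2)` is assumed to warm-start the stage-2 network
# from the stage-1 weights. A plausible sketch that copies every parameter
# whose name and shape match (an assumption -- the real helper is not shown):
def init_stage2_model(net1, net2):
    dst = net2.state_dict()
    src = {k: v for k, v in net1.state_dict().items()
           if k in dst and dst[k].shape == v.shape}
    dst.update(src)
    net2.load_state_dict(dst)
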
Example #23
def main_stage2(stage1_dict):
    net1 = stage1_dict["net"]
    thresholds = stage1_dict["distance"]["thresholds"]

    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0
    print('==> Building model..')
    net2 = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                  distance=args.distance, scaled=args.scaled, cosine_weight=args.cosine_weight, thresholds=thresholds)
    net2 = net2.to(device)

    criterion_dis = DFPLossGeneral(beta=args.beta, sigma=args.sigma, gamma=args.gamma)
    optimizer = optim.SGD(net2.parameters(), lr=args.stage2_lr, momentum=0.9, weight_decay=5e-4)

    if not args.evaluate:
        init_stage2_model(net1, net2)
    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Within Loss', 'Between Loss', 'Within-Gen Loss', 'Between-Gen Loss',
                          'Cen2cen Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net2, trainloader, optimizer, criterion_dis, device)
            save_model(net2, epoch, os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([epoch + 1, train_out["dis_loss_total"], train_out["dis_loss_within"],
                           train_out["dis_loss_between"], train_out["dis_loss_within_gen"],
                           train_out["dis_loss_between_gen"], train_out["dis_loss_cen2cen"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net2, trainloader, device, args.plotfolder2, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max, plot_quality=args.plot_quality)
    if args.plot:
        # plot the test set
        plot_feature(net2, testloader, device, args.plotfolder2, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max, plot_quality=args.plot_quality)

    # calculating distances for last epoch
    # distance_results = plot_distance(net2, trainloader, device, args)

    logger.close()
    print("\nFinished Stage-2 training...\n")
    print("===> Evaluating ...")
    stage1_test(net2, testloader, device)
    return net2
Example #24
def train(train_loader, model, optimizer, epoch, logger, load_triplets=False):

    # switch to train mode
    model.train()
    pbar = tqdm(enumerate(train_loader))
    for batch_idx, data in pbar:
        if load_triplets:
            data_a, data_p, data_n = data
        else:
            data_a, data_p = data

        if args.cuda:
            data_a, data_p = data_a.cuda(), data_p.cuda()
        data_a, data_p = Variable(data_a), Variable(data_p)
        # compute descriptors on either device (the original only did this
        # under args.cuda, which left out_a/out_p undefined on CPU)
        out_a, out_p = model(data_a), model(data_p)

        # load_triplets=False for L2Net and HardNet: these mine the remaining patch of the triplet from the batch itself
        if load_triplets:
            data_n = data_n.cuda()
            data_n = Variable(data_n)
            out_n = model(data_n)

        # for comparison with L2Net and random_global
        if args.batch_reduce == 'L2Net':
            loss = loss_L2Net(out_a,
                              out_p,
                              anchor_swap=args.anchorswap,
                              margin=args.margin,
                              loss_type=args.loss)
        elif args.batch_reduce == 'random_global':
            # use random negative patches sampled from the dataset
            loss = loss_random_sampling(out_a,
                                        out_p,
                                        out_n,
                                        margin=args.margin,
                                        anchor_swap=args.anchorswap,
                                        loss_type=args.loss)
        else:
            loss = loss_HardNet(out_a,
                                out_p,
                                margin=args.margin,
                                anchor_swap=args.anchorswap,
                                anchor_ave=args.anchorave,
                                batch_reduce=args.batch_reduce,
                                loss_type=args.loss)

        # E2 loss from L2Net, penalizing correlation between descriptor components
        if args.decor:
            loss += CorrelationPenaltyLoss()(out_a)

        # global orthogonal regularization (GOR) for HardNet
        if args.gor:
            loss += args.alpha * global_orthogonal_regularization(out_a, out_n)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        adjust_learning_rate(optimizer, args)
        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]  Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data_a), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
            if args.enable_logging:
                logger.log_string(
                    'logs',
                    'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                        epoch, batch_idx * len(data_a),
                        len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), loss.item()))

    # make sure the output directory exists (replaces the os.stat / bare-except idiom)
    out_dir = '{}{}'.format(args.model_dir, suffix)
    os.makedirs(out_dir, exist_ok=True)

    torch.save({
        'epoch': epoch + 1,
        'state_dict': model.state_dict()
    }, '{}/checkpoint_{}.pth'.format(out_dir, epoch))
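
# For reference, `loss_HardNet` is the "hardest-in-batch" triplet margin loss
# of Mishchuk et al. (2017). A minimal sketch of the default batch_reduce='min'
# path, without anchor_swap/anchor_ave and mining negatives only across rows --
# a simplified assumption, not the full implementation used above:
def loss_HardNet_sketch(anchor, positive, margin=1.0, eps=1e-8):
    # pairwise L2 distances between every anchor and every positive descriptor
    d_mat = torch.cdist(anchor, positive) + eps
    pos = torch.diagonal(d_mat)  # distances of the matching pairs
    # mask the matching pairs, then take the hardest (closest) negative per anchor
    mask = torch.eye(d_mat.size(0), dtype=torch.bool, device=d_mat.device)
    min_neg = d_mat.masked_fill(mask, float('inf')).min(dim=1)[0]
    return torch.clamp(margin + pos - min_neg, min=0.0).mean()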