Example #1
def main_stage1():
    print(f"\nStart Stage-1 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    # data loader

    # Model
    print('==> Building model..')
    net = Network(backbone=args.arch,
                  embed_dim=512,
                  num_classes=args.train_class_num,
                  use_fc=False,
                  attmodule=False,
                  classifier='dotproduct',
                  backbone_fc=False,
                  data_shape=4)
    # net = models.__dict__[args.arch](num_classes=args.train_class_num) # CIFAR 100
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(
            ['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.lr, step=10)
        print('\nStage_1 Epoch: %d   Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        train_loss, train_acc = stage1_train(net, trainloader, optimizer,
                                             criterion, device)
        save_model(net, None, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, optimizer.param_groups[0]['lr'], train_loss, train_acc
        ])
    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
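
# --- Hedged sketch (not from the repo): the step-decay schedule called above.
# The calls pass `lr` and `step` (and sometimes `factor`), so a plain
# "multiply by `factor` every `step` epochs" rule is assumed; the default
# factor is a guess.
def adjust_learning_rate(optimizer, epoch, init_lr, factor=0.1, step=10):
    """Decay the learning rate by `factor` every `step` epochs."""
    lr = init_lr * (factor ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr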
Example #2
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # For initializing the backbone, the two branches, and the centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled)
    criterion = DFPLoss(alpha=args.alpha, beta=args.beta)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Within Loss', 'Between Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["cls_loss"], train_out["dis_loss_within"],
                           train_out["dis_loss_between"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net, trainloader, device, args.plotfolder1, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max,
                             plot_quality=args.plot_quality, normalized=args.plot_normalized)
    if args.plot:
        # plot the test set
        plot_feature(net, testloader, device, args.plotfolder1, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max,
                     plot_quality=args.plot_quality, normalized=args.plot_normalized)

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {"net": net,
            "distance": distance_results
            }
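
# --- Hedged sketch (not from the repo): the checkpoint format resumed above.
# Inferred from the resume branches, which read keys 'net', 'epoch', and
# sometimes 'optimizer'. The arity of save_model varies across these scripts;
# this variant is an assumption.
import torch

def save_model(net, optimizer, epoch, path):
    state = {'net': net.state_dict(), 'epoch': epoch}
    if optimizer is not None:
        state['optimizer'] = optimizer.state_dict()
    torch.save(state, path)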
def main_stage2(net, mid_energy):
    print("Starting stage-2 fine-tuning ...")
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage1_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage1_lr,
                                 factor=args.stage1_lr_factor,
                                 step=args.stage1_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append(
                [epoch + 1, train_out["train_loss"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net,
                             args,
                             trainloader,
                             device,
                             args.plotfolder,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net,
                             args,
                             testloader,
                             device,
                             args.plotfolder,
                             epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality,
                             testmode=True)
        logger.close()
        print(f"\nFinish Stage-1 training...\n")
def main():
    print(f"\nStart  training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = BuildNet(backbone=args.arch,
                   num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    # NOTE: `criterion` was not defined anywhere in this snippet; a plain
    # cross-entropy loss is assumed here so train()/test() below have a loss.
    criterion = nn.CrossEntropyLoss()

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(
            ['Epoch', 'Train Loss', 'Train Acc.', "Test F1", 'threshold'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.lr,
                                 factor=args.lr_factor,
                                 step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            test_out = test(net, testloader, criterion, device)
            logger.append([
                epoch + 1, train_out["train_loss"], train_out["accuracy"],
                test_out["best_F1"], test_out["best_thres"]
            ])
        logger.close()
        print(f"\nFinish training...\n")

    else:
        print("===> Evaluating ...")
        test(net, testloader, criterion, device)
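
# --- Hedged sketch (not from the repo): the best-F1 threshold sweep that
# test() reports above ("Test F1" / 'threshold'). Assumes binary known(1) /
# unknown(0) labels and one score per sample; names are illustrative.
import numpy as np
from sklearn.metrics import f1_score

def sweep_best_f1(scores, labels, num_thresholds=100):
    """Try evenly spaced thresholds; keep the one with the highest F1."""
    best_f1, best_thres = 0.0, 0.0
    for thres in np.linspace(scores.min(), scores.max(), num_thresholds):
        preds = (scores >= thres).astype(int)
        f1 = f1_score(labels, preds)
        if f1 > best_f1:
            best_f1, best_thres = f1, thres
    return {"best_F1": best_f1, "best_thres": best_thres}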
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim, p=args.p)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion = DFPLoss(temperature=args.temperature)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage1_es):
            adjust_learning_rate(optimizer, epoch, args.stage1_lr,
                                 factor=args.stage1_lr_factor, step=args.stage1_lr_step)
            print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage1_train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
            logger.append([epoch + 1, train_out["train_loss"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net, args, trainloader, device, args.plotfolder, epoch=epoch,
                             plot_class_num=args.train_class_num, plot_quality=args.plot_quality)
                plot_feature(net, args, testloader, device, args.plotfolder, epoch="test" + str(epoch),
                             plot_class_num=args.train_class_num + 1, plot_quality=args.plot_quality, testmode=True)
        logger.close()
        print(f"\nFinish Stage-1 training...\n")

    print("===> Evaluating stage-1 ...")
    stage_test(net, testloader, device)
    mid_dict = stage_valmixup(net, trainloader, device)
    print("===> stage1 energy based classification")
    stage_evaluate(net, testloader, mid_dict["mid_unknown"].item(), mid_dict["mid_known"].item(), feature="energy")
    print("===> stage1 softmax based classification")
    stage_evaluate(net, testloader, 0., 1., feature="normweight_fea2cen")
    return {
        "net": net.state_dict(),
        "mid_known": mid_dict["mid_known"],
        "mid_unknown": mid_dict["mid_unknown"]
    }
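
# --- Hedged sketch (not from the repo): the score behind "energy based
# classification" above. mid_known / mid_unknown are midpoints of such a
# score; the standard free-energy form over the logits is assumed here,
# with T mirroring args.temperature.
import torch

def energy_score(logits, T=1.0):
    """E(x) = -T * logsumexp(logits / T); lower energy suggests 'known'."""
    return -T * torch.logsumexp(logits / T, dim=1)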
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # For initializing the backbone, the two branches, and the centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Train Acc.'])

    # after resume
    criterion = DFPLoss(scaling=args.scaling)
    optimizer = optim.Adam(net.parameters(), lr=args.stage1_lr)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, factor=0.2, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([epoch + 1, train_out["train_loss"], train_out["accuracy"]])
        if args.plot:
            plot_feature(net, args, trainloader, device, args.plotfolder, epoch=epoch,
                         plot_class_num=args.train_class_num, plot_quality=args.plot_quality)
            plot_feature(net, args, testloader, device, args.plotfolder, epoch="test" + str(epoch),
                         plot_class_num=args.train_class_num + 1, plot_quality=args.plot_quality, testmode=True)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {
        "net": net
    }
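
# --- Hedged sketch (not from the repo): the Logger shared by these scripts.
# A plain text table: one header row via set_names() and one row per epoch
# via append(); resume=True appends to an existing log. The separator and
# file handling here are assumptions.
class Logger:
    def __init__(self, path, resume=False):
        self.file = open(path, 'a' if resume else 'w')

    def set_names(self, names):
        self.file.write('\t'.join(names) + '\n')

    def append(self, values):
        self.file.write('\t'.join(str(v) for v in values) + '\n')
        self.file.flush()

    def close(self):
        self.file.close()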
Example #7
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # For initializing the backbone, the two branches, and the centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                 distance=args.distance, scaled=args.scaled, cosine_weight=args.cosine_weight)
    # embed_dim = net.feat_dim if not args.embed_dim else args.embed_dim
    # criterion_cls = nn.CrossEntropyLoss()
    criterion_dis = DFPLoss(beta=args.beta, sigma=args.sigma)
    optimizer = optim.SGD(net.parameters(), lr=args.stage1_lr, momentum=0.9, weight_decay=5e-4)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
                          'Within Loss', 'Between Loss', 'Cen2cen Loss', 'Train Acc.'])

    for epoch in range(start_epoch, start_epoch + args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=15)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion_dis, device)
        save_model(net, epoch, os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        # ['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
        # 'Within Loss', 'Between Loss','Cen2cen loss', 'Train Acc.']
        logger.append([epoch + 1, train_out["train_loss"], 0.0,
                       train_out["dis_loss_total"], train_out["dis_loss_within"],
                       train_out["dis_loss_between"], train_out["dis_loss_cen2cen"], train_out["accuracy"]])
        if args.plot:
            plot_feature(net, trainloader, device, args.plotfolder, epoch=epoch,
                         plot_class_num=args.train_class_num, maximum=args.plot_max, plot_quality=args.plot_quality)
    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    return net
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known, mid_unknown=mid_unknown,
                             gamma=args.gamma, temperature=args.temperature, feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=args.stage2_lr, momentum=0.9, weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Class Loss', 'Energy Loss',
                          'Energy Known', 'Energy Unknown', 'Train Acc.', "Test F1"])

    if not args.evaluate:
        best_F1_list = []
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr,
                                 factor=args.stage2_lr_factor, step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion, device)

            save_model(net, optimizer, epoch, os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            # test_out = test_with_hist(net, testloader, device, name=f"stage2_test{epoch}")
            test_out = test(net, testloader, device)
            # stage_valmixup(net, trainloader, device, name=f"stage2_mixup{epoch}")
            logger.append([epoch + 1, train_out["train_loss"], train_out["loss_classification"],
                           train_out["loss_energy"], train_out["loss_energy_known"],
                           train_out["loss_energy_unknown"], train_out["accuracy"],
                           test_out["best_F1"]
                           ])
            best_F1_list.append(test_out["best_F1"])
        logger.close()
        print(f"\nFinish Stage-2 training...\n")
        last_five = np.array(best_F1_list[-5:])
        print(f"\nGamma:{args.gamma} | F1_mean: {last_five.mean()} | F1_std: {last_five.std()}")
Example #9
def main_stage1():
    print(f"\nStart Stage-1 training ...\n")
    # For initializing the backbone, the two branches, and the centroids.
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # Model
    print('==> Building model..')
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)

    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.stage1_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage1_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage1_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage1.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance Loss',
            'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss(alpha=args.alpha)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage1_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage1_es):
        adjust_learning_rate(optimizer, epoch, args.stage1_lr, step=20)
        print('\nStage_1 Epoch: %d | Learning rate: %f ' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        train_out = stage1_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_1_last_model.pth'))
        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["loss_distance"], train_out["accuracy"]
        ])

    # calculating distances for last epoch
    distance_results = plot_distance(net, trainloader, device, args)
    # print(f"the distance thresholds are\n {distance_results['thresholds']}\n")
    # gap_results = plot_gap(net, trainloader, device, args)
    # stat = get_gap_stat(net, trainloader, device, args)
    # estimator =CGD_estimator(gap_results)

    logger.close()
    print(f"\nFinish Stage-1 training...\n")
    print("===> Evaluating ...")
    stage1_test(net, testloader, device)

    return {
        "net": net,
        "distance": distance_results,
        # "stat": stat
    }
Example #10
def main():
    print(f"\nStart  training ...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net = BuildNet(backbone=args.arch,
                   num_classes=args.train_class_num,
                   embed_dim=args.embed_dim)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    # NOTE: `criterion` was not defined anywhere in this snippet; a plain
    # cross-entropy loss is assumed here so train()/test() below have a loss.
    criterion = nn.CrossEntropyLoss()

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            loggerList = []
            for i in range(args.train_class_num, args.test_class_num + 1):
                loggerList.append(
                    Logger(os.path.join(args.checkpoint, f'log{i}.txt'),
                           resume=True))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint, f'log{i}.txt'))
            logger.set_names([
                'Epoch', 'Train Loss', 'Train Acc.', "Pos-F1", 'Norm-F1',
                'Energy-F1'
            ])
            loggerList.append(logger)

    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.lr,
                                 factor=args.lr_factor,
                                 step=args.lr_step)
            print('\nEpoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = train(net, trainloader, optimizer, criterion, device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))

            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data',
                    train=False,
                    download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(testset,
                                                         batch_size=args.bs,
                                                         shuffle=False,
                                                         num_workers=4)
                test_out = test(net, testloader, criterion, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([
                    epoch + 1, train_out["train_loss"], train_out["accuracy"],
                    test_out["best_F1_possibility"], test_out["best_F1_norm"],
                    test_out["best_F1_energy"]
                ])
        for logger in loggerList:
            logger.close()
        print(f"\nFinish training...\n")
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = DFPNormLoss(mid_known=1.3 * mid_known,
                            mid_unknown=0.7 * mid_unknown,
                            alpha=args.alpha,
                            temperature=args.temperature,
                            feature='energy')
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage2_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Class Loss', 'Energy Loss', 'Energy Known',
            'Energy Unknown', 'Train Acc.'
        ])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, train_out["train_loss"],
                train_out["loss_classification"], train_out["loss_energy"],
                train_out["loss_energy_known"],
                train_out["loss_energy_unknown"], train_out["accuracy"]
            ])
            if args.plot:
                plot_feature(net,
                             args,
                             trainloader,
                             device,
                             args.plotfolder,
                             epoch="stage2_" + str(epoch),
                             plot_class_num=args.train_class_num,
                             plot_quality=args.plot_quality)
                plot_feature(net,
                             args,
                             testloader,
                             device,
                             args.plotfolder,
                             epoch="stage2_test" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             plot_quality=args.plot_quality,
                             testmode=True)
        logger.close()
        print(f"\nFinish Stage-2 training...\n")

        print("===> Evaluating stage-2 ...")
        stage_test(net, testloader, device, name="stage2_test_doublebar")
        stage_valmixup(net, trainloader, device, name="stage2_mixup_result")
Example #12
            if cfg.TENSORBOARD_SAVE:
                writer.add_scalar('eval_mean_reward', eval_mean_reward, eval_i)
                writer.add_scalar('eval_profit', eval_total_profit, eval_i)
                writer.add_scalar('eval_acc_profit', eval_acc_profit, eval_i)
            # print("Validation episode", eval_i, "ended. Mean reward =", eval_mean_reward, "| Total profit =", eval_total_profit, "(Start date =", start_date, ")")
            # print("Validation episode {} ended. Mean reward = {} | Total profit = {}".format(eval_i, eval_mean_reward, eval_total_profit))
            logger.print_out(
                "Validation episode {} ended. Mean reward = {} | Total profit = {} | Acc profit = {}"
                .format(eval_i, eval_mean_reward, eval_total_profit,
                        eval_acc_profit))

            # print("Actions: {} (Start date = {})".format(actions, start_date))
            logger.print_out("Actions: {} (Start date = {})".format(
                actions, start_date))

            # rounded_obs = [round(x,2) for x in agent.env.obs.flatten()]
            # logger.print_out("obs: {}".format(rounded_obs))
            # logger.print_out("attention_probs: {}".format(agent.env.attention_probs))
            # logger.print_out("attention_obs: {}".format(agent.env.attention_obs))

            logger.print_out("")
            # print()

            eval_i += 1
        # End eval

    logger.close()
    os.system('clear')
    agent.clear_memory()
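
# --- Hedged sketch (not from the repo): the writer behind cfg.TENSORBOARD_SAVE.
# The eval loop above assumes a SummaryWriter created during setup; this is
# the standard PyTorch wiring, with a placeholder log_dir, value, and step.
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='./runs/eval')
writer.add_scalar('eval_mean_reward', 0.42, 0)  # tag, scalar value, global step
writer.close()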
Example #13
def main_stage2(net1, centroids):

    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    # Ignore the classAwareSampler since we are not focusing on the long-tailed problem.
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.bs,
                                              shuffle=True,
                                              num_workers=4)
    print('==> Building model..')
    net2 = Network(backbone=args.arch,
                   embed_dim=512,
                   num_classes=args.train_class_num,
                   use_fc=True,
                   attmodule=True,
                   classifier='metaembedding',
                   backbone_fc=False,
                   data_shape=4)
    net2 = net2.to(device)
    if not args.evaluate:
        init_stage2_model(net1, net2)

    criterion = nn.CrossEntropyLoss()
    fea_criterion = DiscCentroidsLoss(args.train_class_num,
                                      args.stage1_feature_dim)
    fea_criterion = fea_criterion.to(device)
    optimizer = optim.SGD(net2.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
    optimizer_criterion = optim.SGD(fea_criterion.parameters(),
                                    lr=args.lr * 0.1,
                                    momentum=0.9,
                                    weight_decay=5e-4)

    # passing centroids data.
    if not args.evaluate:
        pass_centroids(net2, fea_criterion, init_centroids=centroids)

    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(
            ['Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            # For simplicity, we do not schedule each optimizer separately; performance did not vary much.
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            print('\nStage_2 Epoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_loss, train_acc = stage2_train(net2, trainloader, optimizer,
                                                 optimizer_criterion,
                                                 criterion, fea_criterion,
                                                 device)
            save_model(net2, None, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, optimizer.param_groups[0]['lr'], train_loss,
                train_acc
            ])
            pass_centroids(net2, fea_criterion, init_centroids=None)
            if epoch % 5 == 0:
                test(net2, testloader, device)
        print(f"\nFinish Stage-2 training...\n")
    logger.close()

    test(net2, testloader, device)
    return net2
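
# --- Hedged sketch (not from the repo): what init_stage2_model has to do.
# Copy stage-1 weights into the larger stage-2 network; strict=False lets the
# stage-2-only modules (attention, meta-embedding classifier) keep their fresh
# initialization. If net1 is still wrapped in DataParallel, its keys carry a
# 'module.' prefix that would need stripping first.
def init_stage2_model(net1, net2):
    net2.load_state_dict(net1.state_dict(), strict=False)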
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    # Data
    print('==> Preparing data..')
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    trainset = MNIST(root='../../data',
                     train=True,
                     download=True,
                     transform=transform,
                     train_class_num=args.train_class_num,
                     test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)
    testset = MNIST(root='../../data',
                    train=False,
                    download=True,
                    transform=transform,
                    train_class_num=args.train_class_num,
                    test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
    # data loader
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=args.bs,
                                              shuffle=True,
                                              num_workers=4)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.bs,
                                             shuffle=False,
                                             num_workers=4)

    # Model
    net = Network(backbone=args.arch,
                  num_classes=args.train_class_num,
                  embed_dim=args.embed_dim)
    fea_dim = net.classifier.in_features
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names([
            'Epoch', 'Learning Rate', 'Train Loss', 'Train Acc.', 'Test Loss',
            'Test Acc.'
        ])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, args.es):
            adjust_learning_rate(optimizer, epoch, args.lr, step=20)
            print('\nEpoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_loss, train_acc = train(net, trainloader, optimizer,
                                          criterion, device)
            save_model(net, None, epoch,
                       os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0
            #
            logger.append([
                epoch + 1, optimizer.param_groups[0]['lr'], train_loss,
                train_acc, test_loss, test_acc
            ])
            plot_feature(net,
                         trainloader,
                         device,
                         args.plotfolder,
                         epoch=epoch,
                         plot_class_num=args.train_class_num,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality)
            test(epoch, net, trainloader, testloader, criterion, device)

    test(99999, net, trainloader, testloader, criterion, device)
    plot_feature(net,
                 testloader,
                 device,
                 args.plotfolder,
                 epoch="test",
                 plot_class_num=args.train_class_num + 1,
                 maximum=args.plot_max,
                 plot_quality=args.plot_quality)
    logger.close()
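
# --- Hedged sketch (not from the repo): what plot_feature is used for here.
# Scatter the learned embeddings per class. It assumes the network returns
# 2-D features directly, which is a simplification of the real helper (which
# also handles `maximum`, plot quality, and normalization).
import matplotlib.pyplot as plt
import torch

def plot_feature_sketch(net, loader, device, folder, epoch=0):
    net.eval()
    feats, labels = [], []
    with torch.no_grad():
        for x, y in loader:
            feats.append(net(x.to(device)).cpu())
            labels.append(y)
    feats, labels = torch.cat(feats), torch.cat(labels)
    plt.scatter(feats[:, 0], feats[:, 1], c=labels.numpy(), s=1, cmap='tab10')
    plt.savefig(f"{folder}/epoch_{epoch}.png")
    plt.close()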
def main_stage2(net, mid_known, mid_unknown):
    print("Starting stage-2 fine-tuning ...")
    start_epoch = 0
    criterion = FinetuneLoss(mid_known=mid_known,
                             mid_unknown=mid_unknown,
                             gamma=args.gamma,
                             temperature=args.temperature,
                             feature='energy')
    criterion = criterion.to(device)
    optimizer = torch.optim.SGD(net.parameters(),
                                lr=args.stage2_lr,
                                momentum=0.9,
                                weight_decay=5e-4)
    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']
            loggerList = []
            for i in range(args.train_class_num, args.test_class_num + 1):
                loggerList.append(
                    Logger(os.path.join(args.checkpoint, f'log{i}_stage2.txt'),
                           resume=True))
        else:
            print("=> no checkpoint found at '{}'".format(args.stage2_resume))
    else:
        loggerList = []
        for i in range(args.train_class_num, args.test_class_num + 1):
            logger = Logger(os.path.join(args.checkpoint,
                                         f'log{i}_stage2.txt'))
            logger.set_names(
                ['Epoch', 'Train Loss', 'Train Acc.', 'Energy-F1'])
            loggerList.append(logger)

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer,
                                 epoch,
                                 args.stage2_lr,
                                 factor=args.stage2_lr_factor,
                                 step=args.stage2_lr_step)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net, trainloader, optimizer, criterion,
                                     device)
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))

            for test_class_num in range(args.train_class_num,
                                        args.test_class_num + 1):
                testset = CIFAR10(
                    root='../../data',
                    train=False,
                    download=True,
                    transform=transform_test,
                    train_class_num=args.train_class_num,
                    test_class_num=test_class_num,
                    includes_all_train_class=args.includes_all_train_class)
                testloader = torch.utils.data.DataLoader(
                    testset,
                    batch_size=args.stage2_bs,
                    shuffle=False,
                    num_workers=4)
                test_out = test(net, testloader, device)
                logger = loggerList[test_class_num - args.train_class_num]
                logger.append([
                    epoch + 1, train_out["train_loss"], train_out["accuracy"],
                    test_out["best_F1"]
                ])
        for logger in loggerList:
            logger.close()
        print(f"\nFinish Stage-2 training...\n")
Example #16
def main_stage2(stage1_dict):
    print('==> Building stage2 model..')
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    net = DFPNet(backbone=args.arch,
                 num_classes=args.train_class_num,
                 embed_dim=args.embed_dim,
                 distance=args.distance,
                 similarity=args.similarity,
                 scaled=args.scaled,
                 norm_centroid=args.norm_centroid,
                 decorrelation=args.decorrelation)
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        net = stage1_dict['net']
        net = net.to(device)
        thresholds = stage1_dict['distance']['thresholds']
        # stat = stage1_dict["stat"]
        net.module.set_threshold(thresholds.to(device))

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            try:
                thresholds = checkpoint['net']['thresholds']
            except KeyError:
                thresholds = checkpoint['net']['module.thresholds']
            net.module.set_threshold(thresholds.to(device))

            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Distance Center', 'Train Acc.'
        ])

    if args.evaluate:
        stage2_test(net, testloader, device)
        return net

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    optimizer = optim.SGD(net.parameters(),
                          lr=args.stage2_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    for epoch in range(start_epoch, args.stage2_es):
        # For simplicity, a single optimizer and schedule are used; performance did not vary much.
        adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
        print('\nStage_2 Epoch: %d   Learning rate: %f' %
              (epoch + 1, optimizer.param_groups[0]['lr']))
        # if epoch %5 ==0:
        #     distance_results = plot_distance(net, trainloader, device, args)
        #     thresholds = distance_results['thresholds']
        #     net.module.set_threshold(thresholds.to(device))
        train_out = stage2_train(net, trainloader, optimizer, criterion,
                                 device)
        save_model(net, epoch,
                   os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
        stage2_test(net, testloader, device)
        # stat = get_gap_stat(net2, trainloader, device, args)

        logger.append([
            epoch + 1, train_out["train_loss"], train_out["loss_similarity"],
            train_out["distance_in"], train_out["distance_out"],
            train_out["distance_center"], train_out["accuracy"]
        ])

    print(f"\nFinish Stage-2 training...\n")

    logger.close()
    stage2_test(net, testloader, device)
    return net
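
# --- Hedged sketch (not from the repo): handling DataParallel's key prefix.
# The try/except around 'module.thresholds' above works around the 'module.'
# prefix that torch.nn.DataParallel prepends to every state_dict key. An
# illustrative helper that strips it before loading into a bare model:
def strip_module_prefix(state_dict):
    return {k[len('module.'):] if k.startswith('module.') else k: v
            for k, v in state_dict.items()}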
Example #17
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # checkpoint
    args.checkpoint = './checkpoints/cifar/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    trainset = CIFAR100(root='../../data', train=True, download=True, transform=transform_train,
                        train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                        includes_all_train_class=args.includes_all_train_class)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testset = CIFAR100(root='../../data', train=False, download=True, transform=transform_test,
                       train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                       includes_all_train_class=args.includes_all_train_class)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)


    # Model
    print('==> Building model..')
    net = models.__dict__[args.arch](num_classes=args.train_class_num) # CIFAR 100
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Learning Rate', 'Train Loss','Train Acc.', 'Test Loss', 'Test Acc.'])

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

    # test(0, net, trainloader, testloader, criterion, device)
    epoch = 0
    if not args.evaluate:
        for epoch in range(start_epoch, start_epoch + args.es):
            adjust_learning_rate(optimizer, epoch, args.lr)
            print('\nEpoch: %d   Learning rate: %f' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_loss, train_acc = train(net, trainloader, optimizer, criterion, device)
            save_model(net, None, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            test_loss, test_acc = 0, 0
            #
            logger.append([epoch+1, optimizer.param_groups[0]['lr'], train_loss, train_acc, test_loss, test_acc])

    test(epoch, net, trainloader, testloader, criterion, device)
    logger.close()
Example #18
def main():
    args.checkpoint = './checkpoints/mnist/' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # folder to save figures
    args.plotfolder = './checkpoints/mnist/' + args.arch + '/plotter'
    if not os.path.isdir(args.plotfolder):
        mkdir_p(args.plotfolder)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(device)
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    trainset = MNIST(root='../../data', train=True, download=True, transform=transform,
                     train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                     includes_all_train_class=args.includes_all_train_class)

    testset = MNIST(root='../../data', train=False, download=True, transform=transform,
                    train_class_num=args.train_class_num, test_class_num=args.test_class_num,
                    includes_all_train_class=args.includes_all_train_class)

    # data loader
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.bs, shuffle=True, num_workers=4)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.bs, shuffle=False, num_workers=4)

    print('==> Building model..')
    net = Network(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim)
    fea_dim = net.classifier.in_features
    net = net.to(device)

    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    criterion_softmax = nn.CrossEntropyLoss()
    criterion_centerloss = CenterLoss(num_classes=args.train_class_num, feat_dim=fea_dim).to(device)
    optimizer_softmax = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    optimizer_centerloss = torch.optim.SGD(criterion_centerloss.parameters(), lr=args.center_lr, momentum=0.9,
                                           weight_decay=5e-4)

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            criterion_centerloss.load_state_dict(checkpoint['centerloss'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'Total Loss', 'Softmax Loss', 'Center Loss', 'Train Acc.'])


    if not args.evaluate:
        scheduler = lr_scheduler.StepLR(optimizer_softmax, step_size=20, gamma=0.1)
        for epoch in range(start_epoch, start_epoch + args.es):
            print('\nEpoch: %d   Learning rate: %f' % (epoch + 1, optimizer_softmax.param_groups[0]['lr']))
            train_loss, softmax_loss, center_loss, train_acc = train(net, trainloader, optimizer_softmax,
                                                                     optimizer_centerloss, criterion_softmax,
                                                                     criterion_centerloss, device)
            save_model(net, criterion_centerloss, epoch, os.path.join(args.checkpoint, 'last_model.pth'))
            # plot the training data
            if args.plot:
                plot_feature(net, criterion_centerloss, trainloader, device, args.plotfolder, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max, plot_quality=args.plot_quality)

            logger.append([epoch + 1, train_loss, softmax_loss, center_loss, train_acc])
            scheduler.step()
            test(net, testloader, device)

    if args.plot:
        plot_feature(net, criterion_centerloss, testloader, device, args.plotfolder, epoch="test",
                     plot_class_num=args.train_class_num+1, maximum=args.plot_max, plot_quality=args.plot_quality)
    logger.close()
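
# --- Hedged sketch (not from the repo): the center loss used above, in its
# textbook form (Wen et al., "A Discriminative Feature Learning Approach for
# Deep Face Recognition"): each class owns a learnable centroid and the loss
# pulls features toward their class centroid. The repo's CenterLoss may differ.
import torch
import torch.nn as nn

class CenterLossSketch(nn.Module):
    def __init__(self, num_classes, feat_dim):
        super().__init__()
        self.centers = nn.Parameter(torch.randn(num_classes, feat_dim))

    def forward(self, features, labels):
        # Squared distance between each feature and its own class center.
        diff = features - self.centers[labels]
        return 0.5 * (diff ** 2).sum(dim=1).mean()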
Example #19
def main_stage2(stage1_dict):
    net1 = stage1_dict['net']
    thresholds = stage1_dict['distance']['thresholds']
    estimator = stage1_dict['estimator']
    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch
    print('==> Building model..')
    net2 = DFPNet(backbone=args.arch,
                  num_classes=args.train_class_num,
                  embed_dim=args.embed_dim,
                  distance=args.distance,
                  similarity=args.similarity,
                  scaled=args.scaled,
                  thresholds=thresholds,
                  norm_centroid=args.norm_centroid,
                  amplifier=args.amplifier,
                  estimator=estimator)
    net2 = net2.to(device)
    if not args.evaluate and not os.path.isfile(args.stage2_resume):
        init_stage2_model(net1, net2)

    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            # best_acc = checkpoint['acc']
            # print("BEST_ACCURACY: "+str(best_acc))
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'),
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names([
            'Epoch', 'Train Loss', 'Similarity Loss', 'Distance in',
            'Distance out', 'Generate within', 'Generate 2origin', 'Train Acc.'
        ])

    # after resume
    criterion = DFPLoss2(alpha=args.alpha, beta=args.beta, theta=args.theta)
    optimizer = optim.SGD(net2.parameters(),
                          lr=args.stage2_lr,
                          momentum=0.9,
                          weight_decay=5e-4)

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            print('\nStage_2 Epoch: %d   Learning rate: %f' %
                  (epoch + 1, optimizer.param_groups[0]['lr']))
            # A single optimizer is used for all parameters here, for simplicity; using separate optimizers did not change performance much.
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=10)
            train_out = stage2_train(net2, trainloader, optimizer, criterion,
                                     device)
            save_model(net2, epoch,
                       os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            logger.append([
                epoch + 1, train_out["train_loss"],
                train_out["loss_similarity"], train_out["distance_in"],
                train_out["distance_out"], train_out["generate_within"],
                train_out["generate_2orign"], train_out["accuracy"]
            ])
            if args.plot:
                plot_feature(net2,
                             args,
                             trainloader,
                             device,
                             args.plotfolder2,
                             epoch=epoch,
                             plot_class_num=args.train_class_num,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds)
                plot_feature(net2,
                             args,
                             testloader,
                             device,
                             args.plotfolder2,
                             epoch="test_" + str(epoch),
                             plot_class_num=args.train_class_num + 1,
                             maximum=args.plot_max,
                             plot_quality=args.plot_quality,
                             norm_centroid=args.norm_centroid,
                             thresholds=thresholds,
                             testmode=True)
        if args.plot:
            # plot the test set
            plot_feature(net2,
                         args,
                         testloader,
                         device,
                         args.plotfolder2,
                         epoch="test",
                         plot_class_num=args.train_class_num + 1,
                         maximum=args.plot_max,
                         plot_quality=args.plot_quality,
                         norm_centroid=args.norm_centroid,
                         thresholds=thresholds,
                         testmode=True)
        print(f"\nFinish Stage-2 training...\n")

    logger.close()

    # test2(net2, testloader, device)
    return net2
Example #20
0
def main():
    global best_prec1, args

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if not os.path.isdir(args.checkpoint) and args.local_rank == 0:
        mkdir_p(args.checkpoint)

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print(
                "Warning:  if --fp16 is not used, static_loss_scale will be ignored."
            )

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = Network(backbone=args.arch, num_classes=args.train_class_num)

    model = model.cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        # shared_param / delay_allreduce turns off bucketing in DDP, which can
        # improve performance for low-latency runs. Older versions of Apex use
        # shared_param; newer ones use delay_allreduce.
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.static_loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   verbose=False)

    # optionally resume from a checkpoint
    title = 'ImageNet-' + args.arch
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(
                args.resume,
                map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            if args.local_rank == 0:
                logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                                title=title,
                                resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        if args.local_rank == 0:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
                'Valid Acc.', 'Valid Top5.'
            ])

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # chosen somewhat arbitrarily; adjust as needed
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.workers,
                           device_id=args.local_rank,
                           data_dir=traindir,
                           crop=crop_size,
                           dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size)
    # pipe.build()
    # val_loader = DALIClassificationIterator(pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # if args.evaluate:
    #     validate(val_loader, model, criterion)
    #     return

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        adjust_learning_rate(optimizer, epoch, args)

        if args.local_rank == 0:
            print('\nEpoch: [%d | %d] LR: %f' %
                  (epoch + 1, args.epochs, optimizer.param_groups[0]['lr']))

        [train_loss, train_acc,
         avg_train_time] = train(train_loader, model, criterion, optimizer,
                                 epoch)
        total_time.update(avg_train_time)
        # evaluate on validation set
        # [test_loss, prec1, prec5] = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            # append logger file
            # logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5])
            logger.append([
                optimizer.param_groups[0]['lr'], train_loss, 0.0, train_acc,
                0.0, 0.0
            ])

            # is_best = prec1 > best_prec1
            is_best = False
            # best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,  # store the next epoch so resume continues after this one
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                checkpoint=args.checkpoint,
                filename="checkpoint.pth.tar")
            # if epoch == args.epochs - 1:
            #     print('##Top-1 {0}\n'
            #           '##Top-5 {1}\n'
            #           '##Perf  {2}'.format(prec1, prec5, args.total_batch_size / total_time.avg))

        # reset DALI iterators
        train_loader.reset()
        # val_loader.reset()

    if args.local_rank == 0:
        logger.close()
Example #21
0
def main_stage2(stage1_dict):
    net1 = stage1_dict["net"]
    thresholds = stage1_dict["distance"]["thresholds"]

    print(f"\n===> Start Stage-2 training...\n")
    start_epoch = 0
    print('==> Building model..')
    net2 = DFPNet(backbone=args.arch, num_classes=args.train_class_num, embed_dim=args.embed_dim,
                  distance=args.distance, scaled=args.scaled, cosine_weight=args.cosine_weight, thresholds=thresholds)
    net2 = net2.to(device)

    criterion_dis = DFPLossGeneral(beta=args.beta, sigma=args.sigma, gamma=args.gamma)
    optimizer = optim.SGD(net2.parameters(), lr=args.stage2_lr, momentum=0.9, weight_decay=5e-4)

    if not args.evaluate:
        init_stage2_model(net1, net2)
    if device == 'cuda':
        net2 = torch.nn.DataParallel(net2)
        cudnn.benchmark = True

    if args.stage2_resume:
        # Load checkpoint.
        if os.path.isfile(args.stage2_resume):
            print('==> Resuming from checkpoint..')
            checkpoint = torch.load(args.stage2_resume)
            net2.load_state_dict(checkpoint['net'])
            start_epoch = checkpoint['epoch']
            logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log_stage2.txt'))
        logger.set_names(['Epoch', 'Train Loss', 'Within Loss', 'Between Loss', 'Within-Gen Loss', 'Between-Gen Loss',
                          'Random Loss', 'Train Acc.'])

    if not args.evaluate:
        for epoch in range(start_epoch, args.stage2_es):
            adjust_learning_rate(optimizer, epoch, args.stage2_lr, step=20)
            print('\nStage_2 Epoch: %d | Learning rate: %f ' % (epoch + 1, optimizer.param_groups[0]['lr']))
            train_out = stage2_train(net2, trainloader, optimizer, criterion_dis, device)
            save_model(net2, epoch, os.path.join(args.checkpoint, 'stage_2_last_model.pth'))
            # ['Epoch', 'Train Loss', 'Softmax Loss', 'Distance Loss',
            # 'Within Loss', 'Between Loss','Cen2cen loss', 'Train Acc.']
            logger.append([epoch + 1, train_out["dis_loss_total"], train_out["dis_loss_within"],
                           train_out["dis_loss_between"], train_out["dis_loss_within_gen"],
                           train_out["dis_loss_between_gen"], train_out["dis_loss_cen2cen"], train_out["accuracy"]])
            if args.plot:
                plot_feature(net2, trainloader, device, args.plotfolder2, epoch=epoch,
                             plot_class_num=args.train_class_num, maximum=args.plot_max, plot_quality=args.plot_quality)
    if args.plot:
        # plot the test set
        plot_feature(net2, testloader, device, args.plotfolder2, epoch="test",
                     plot_class_num=args.train_class_num + 1, maximum=args.plot_max, plot_quality=args.plot_quality)

    # calculating distances for last epoch
    # distance_results = plot_distance(net2, trainloader, device, args)

    logger.close()
    print(f"\nFinish Stage-2 training...\n")
    print("===> Evaluating ...")
    stage1_test(net2, testloader, device)
    return net2
Example #22
0
def main():
    start_epoch = 0
    best_loss = float('inf')

    # Model
    print('==> Building model..')
    net = VanillaVAE(in_channels=1, latent_dim=args.latent_dim)
    net = net.to(device)

    if device == 'cuda':
        # Given the data scale and model size, DistributedDataParallel (which
        # would speed up training and inference relative to DataParallel) is unnecessary here.
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.wd)
    scheduler = optim.lr_scheduler.ExponentialLR(optimizer,
                                                 gamma=args.scheduler_gamma)

    if args.resume:
        # Load checkpoint.
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            net.load_state_dict(checkpoint['net'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            start_epoch = checkpoint['epoch']  # resume epoch counting; assumes save_model stored the epoch
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            resume=True)
            print('==> Resuming from checkpoint, loaded..')
        else:
            print("==> No checkpoint found at '{}'".format(args.resume))
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'))
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Recons Loss', 'KLD Loss'])

    if not args.evaluate:
        # training
        print("==> start training..")
        for epoch in range(start_epoch, args.es):
            print('\nStage_1 Epoch: %d | Learning rate: %f ' %
                  (epoch + 1, scheduler.get_last_lr()[-1]))
            train_out = train(net, trainloader,
                              optimizer)  # {train_loss, recons_loss, kld_loss}
            save_model(net, optimizer, epoch,
                       os.path.join(args.checkpoint, 'checkpoint.pth'))
            if train_out["train_loss"] < best_loss:
                save_model(net,
                           optimizer,
                           epoch,
                           os.path.join(args.checkpoint,
                                        'checkpoint_best.pth'),
                           loss=train_out["train_loss"])
                best_loss = train_out["train_loss"]
            logger.append([
                epoch + 1,
                scheduler.get_last_lr()[-1], train_out["train_loss"],
                train_out["recons_loss"], train_out["kld_loss"]
            ])
            scheduler.step()
        logger.close()
        print(f"\n==> Finish training..\n")

    print("===> start evaluating ...")
    generate_images(net, valloader, name="test_reconstruct")
    sample_images(net, name="test_randsample")