def main():
    # release gpu memory
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()
    # torch.cuda.ipc_collect()

    # get the list of csv training files
    # training_csv_files = glob(os.path.join(FLAGS.train_data_path, "*.csv"))

    # build training, validation, and test data loaders
    print(' Preparing the data!')
    # Fix the random seed for identical experiments
    # train_loader, test_loader, test_length = \
    #     get_dataloaders(training_csv_files[0: FLAGS.num_classes],
    #                     FLAGS.test_data_path, FLAGS.num_data_per_class)

    # build train and test loaders for CIFAR-100
    train_loader = get_training_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        batch_size=FLAGS.batch_size,
    )
    test_loader = get_test_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        batch_size=FLAGS.batch_size,
    )
    test_length = len(test_loader.dataset)

    kwargs = {
        "kernels_path_7": FLAGS.kernels_path_7,
        "kernels_path_3": FLAGS.kernels_path_3,
        "num_kernels_7": FLAGS.num_kernels_7,
        "num_kernels_3": FLAGS.num_kernels_3,
        "num_classes": FLAGS.num_classes,
    }
    loss_function = nn.CrossEntropyLoss()

    for conv_model in ["Conv2d", "Conv2dRF"]:
        for resnet_arch in ["resnet18", "resnet34", "resnet50"]:
            name = resnet_arch + '_' + conv_model
            kwargs["conv_model"] = conv_model
            model = getattr(resnet_mod, resnet_arch)(**kwargs)
            if torch.cuda.device_count() > 1:
                model = nn.DataParallel(model)

            print(' Started training!')
            # Table for validation output across runs:
            # column 0 = overall accuracy, column 1 = average accuracy,
            # columns 2: = per-class accuracy (100 classes).
            # Allocated once per architecture so earlier runs are not zeroed out.
            val_output = np.zeros((FLAGS.num_runs, 102))
            for run_id in range(FLAGS.num_runs):
                torch.cuda.empty_cache()
                # re-initialize the weights for each independent run
                if torch.cuda.device_count() > 1:
                    model.module.weights_init()
                else:
                    model.weights_init()
                model.to(device)
                # recreate the optimizer so Adam state does not leak across runs
                optimizer = torch.optim.Adam(model.parameters(), lr=FLAGS.learning_rate)

                for epoch in range(FLAGS.num_epochs):
                    model.train()
                    for batch_idx, (images_, labels_) in tqdm(enumerate(train_loader)):
                        # print("batch_{}: {}".format(batch_idx, torch.cuda.memory_cached()/1e6))
                        train_function(model, optimizer, loss_function, device, images_, labels_)
                        if (batch_idx + 1) % FLAGS.log_interval == 0 or (batch_idx + 1) == len(train_loader):
                            # test_function(model, test_loader, device, test_length, FLAGS.batch_size,
                            #               FLAGS.train_data_path,
                            #               os.path.join(FLAGS.save_path, "submissions",
                            #                            name + '_r={}_e={}_idx={}.csv'.format(
                            #                                run_id, epoch + 1, batch_idx + 1)))
                            model.train()  # reset back to train mode

                val_oa, val_aa, val_pca = val_full(model, device, test_loader, 100)
                val_output[run_id, 0] = val_oa
                val_output[run_id, 1] = val_aa
                val_output[run_id, 2:] = val_pca

                # save the model for this run
                torch.save(
                    {'model_state_dict': model.state_dict()},
                    os.path.join(FLAGS.save_path, "models",
                                 "{}.pt".format(name + '_r={}'.format(run_id + 1))))

            np.save(os.path.join(FLAGS.save_path, "validation_{}.npy".format(name)), val_output)
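# train_function is called above but not defined in this snippet. A minimal sketch of a
# single-batch training step consistent with that call signature might look like the
# following; the body is an assumption, not the original implementation:
def train_function(model, optimizer, loss_function, device, images, labels):
    images, labels = images.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(images)
    loss = loss_function(outputs, labels)
    loss.backward()
    optimizer.step()
    return loss.item()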
                    default=0.2, help="LR decay rate when reaching milestones", type=float)
parser.add_argument("--phases", default=WARM_PHASES,
                    help="number of warm-up epochs", type=int)
parser.add_argument("--old", help="path to an old model")
parser.add_argument("--attacker",
                    help="attacker; FGSMAttack and PGDAttack are currently supported")
parser.add_argument("--epsilon", help="attack epsilon", type=float)
args = parser.parse_args()

if args.pattern in ("train", "attack"):
    train_loader = get_training_dataloader(args.dataset, args.batch, args.num_worker)
    test_loader = get_test_dataloader(args.dataset, args.batch, args.num_worker)
    if args.dataset == "cifar100":
        model = resnet.resnet50()
    elif args.dataset == "cifar10":
        model = resnet.resnet50(num_classes=10)
    else:
        raise ValueError("unsupported dataset: {}".format(args.dataset))
    trainer = Trainer(model, train_loader, test_loader, args.device, args.lr,
                      args.momentum, args.epochs, args.batch, DEFAULT_PARALLELISM,
                      MILESTONES, args.gamma, args.phases)
    if args.pattern == "train":
        trainer.train(args.save_path)
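# The --attacker flag refers to FGSMAttack / PGDAttack classes not shown in this snippet.
# As a hedged illustration of what the FGSM step typically computes (the function name and
# interface below are assumptions, not this repo's API):
import torch

def fgsm_perturb(model, loss_fn, images, labels, epsilon):
    """One FGSM step: perturb inputs along the sign of the input gradient."""
    images = images.clone().detach().requires_grad_(True)
    loss = loss_fn(model(images), labels)
    loss.backward()
    adv = images + epsilon * images.grad.sign()
    return adv.clamp(0, 1).detach()  # keep the adversarial image in valid pixel range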
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel constructor
        # should always set the single device scope; otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per DistributedDataParallel,
            # we need to divide the batch size ourselves based on the total
            # number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map the model to be loaded to the specified single GPU.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code: the upstream ImageNet ImageFolder pipeline was replaced
    # with the project's helper loaders.
    # traindir = os.path.join(args.data, 'train')
    # valdir = os.path.join(args.data, 'val')
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])
    train_loader = utils.get_training_dataloader(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])
    val_loader = utils.get_test_dataloader(mean=[0.485, 0.456, 0.406],
                                           std=[0.229, 0.224, 0.225])

    if args.distributed:
        # The upstream example built the sampler from the (now removed)
        # train_dataset; derive it from the loader's dataset instead. Note the
        # helper loaders above do not accept a sampler argument, so this sampler
        # is only used for set_epoch below.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_loader.dataset)
    else:
        train_sampler = None

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best, filename="model/" + args.modelname + "_" + str(epoch))
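# main_worker follows the upstream PyTorch ImageNet example, whose adjust_learning_rate
# and save_checkpoint helpers are not shown here. A sketch consistent with that example;
# the 30-epoch step schedule and the "best" copy name are assumptions carried over from it:
import shutil
import torch

def adjust_learning_rate(optimizer, epoch, args):
    """Decay the initial LR by 10x every 30 epochs (the upstream example's schedule)."""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Persist the latest checkpoint; copy it aside when it is the best so far."""
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, filename + '.best')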
args = parser.parse_args()
os.mkdir(args.save_dir)

if __name__ == '__main__':
    torch.manual_seed(args.random_seed)
    net = get_network(args)
    # random_shuffle = (torch.rand(50000) > (1 - args.random_rate)) * torch.randint(1, 100, (50000,))

    # data preprocessing:
    cifar100_training_loader = get_training_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=args.b,
        shuffle=True,
        random_rate=args.random_rate
    )
    cifar100_test_loader = get_test_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=args.b,
        shuffle=True
    )

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES, gamma=0.2)  # learning rate decay
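# None of these snippets defines get_training_dataloader / get_test_dataloader. A minimal
# CIFAR-100 version consistent with the (mean, std, num_workers, batch_size, shuffle) calls
# is sketched below; the extra keywords used elsewhere (random_rate, alpha, task,
# disable_rotate, ...) are omitted, and the exact transform list is an assumption:
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

def get_training_dataloader(mean, std, batch_size=128, num_workers=4, shuffle=True):
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    train_set = torchvision.datasets.CIFAR100(
        root='./data', train=True, download=True, transform=transform_train)
    return DataLoader(train_set, shuffle=shuffle,
                      num_workers=num_workers, batch_size=batch_size)

def get_test_dataloader(mean, std, batch_size=128, num_workers=4, shuffle=False):
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    test_set = torchvision.datasets.CIFAR100(
        root='./data', train=False, download=True, transform=transform_test)
    return DataLoader(test_set, shuffle=shuffle,
                      num_workers=num_workers, batch_size=batch_size)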
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # data preprocessing:
    print('==> Preparing dataset %s' % args.dataset)
    if args.dataset == 'cifar100':
        training_loader = get_training_dataloader(
            settings.CIFAR100_TRAIN_MEAN,
            settings.CIFAR100_TRAIN_STD,
            num_workers=4,
            batch_size=args.train_batch,
            shuffle=True
        )
        test_loader = get_test_dataloader(
            settings.CIFAR100_TRAIN_MEAN,
            settings.CIFAR100_TRAIN_STD,
            num_workers=4,
            batch_size=args.test_batch,
            shuffle=False
        )
        num_classes = 100
    else:
        training_loader = get_training_dataloader_10(
            settings.CIFAR10_TRAIN_MEAN,
            settings.CIFAR10_TRAIN_STD,
            num_workers=4,
            batch_size=args.train_batch,
            shuffle=True
        )
        test_loader = get_test_dataloader_10(
            settings.CIFAR10_TRAIN_MEAN,
            settings.CIFAR10_TRAIN_STD,
            num_workers=4,
            batch_size=args.test_batch,
            shuffle=False
        )
        num_classes = 10

    print("==> creating model '{}'".format(args.arch))
    model = get_network(args, num_classes=num_classes)
    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print('    Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=5e-4)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    criterion1 = S_softmax.SimSoftmaxLoss(num_classes=num_classes,
                                          lr=10 * optimizer.param_groups[0]['lr'])

    # use the actual dataset in the log title (was hard-coded to 'cifar-10-')
    title = args.dataset + '-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint file found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.'])

    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.schedule, gamma=0.2)  # learning rate decay
    iter_per_epoch = len(training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    for epoch in range(start_epoch, args.epochs):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train_loss, train_acc = train(training_loader, model, warmup_scheduler,
                                      criterion, criterion1, optimizer, epoch, use_cuda)
        test_loss, test_acc = eval_training(test_loader, model, criterion, epoch, use_cuda)

        logger.append([optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'acc': test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, checkpoint=args.checkpoint)

    logger.close()
    # logger.plot()
    # savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)
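# WarmUpLR is imported from elsewhere in these scripts. A minimal scheduler matching its
# usage above (a per-iteration linear LR ramp over iter_per_epoch * args.warm steps) could
# look like this; the exact implementation is an assumption:
from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    """Linearly ramp the LR from ~0 to the base LR over total_iters iterations."""

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts completed warm-up iterations; the epsilon avoids
        # division by zero on the very first step
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]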
                    help='whether to shuffle the dataset')
parser.add_argument('-warm', type=int, default=1, help='warm-up training phase')
parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate')
args = parser.parse_args()
flor.namespace_stack.test_force(args, 'args')

net = get_network(args, use_gpu=args.gpu)
flor.namespace_stack.test_force(net, 'net')

cifar100_training_loader = get_training_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    num_workers=args.w,
    batch_size=args.b,
    shuffle=args.s)
flor.namespace_stack.test_force(cifar100_training_loader, 'cifar100_training_loader')

cifar100_test_loader = get_test_dataloader(
    settings.CIFAR100_TRAIN_MEAN,
    settings.CIFAR100_TRAIN_STD,
    num_workers=args.w,
    batch_size=args.b,
    shuffle=args.s)
flor.namespace_stack.test_force(cifar100_test_loader, 'cifar100_test_loader')

iter_per_epoch = len(cifar100_training_loader)
flor.namespace_stack.test_force(iter_per_epoch, 'iter_per_epoch')

loss_function = nn.CrossEntropyLoss()
flor.namespace_stack.test_force(loss_function, 'loss_function')
        correct += preds.eq(labels).sum()

    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        test_loss / len(cifar10_test_loader.dataset),
        correct.float() / len(cifar10_test_loader.dataset)
    ))
    return correct.float() / len(cifar10_test_loader.dataset)

if __name__ == '__main__':
    training_loader = get_training_dataloader(args.data_root,
                                              (0.4914, 0.4822, 0.4465),
                                              (0.2023, 0.1994, 0.2010),
                                              num_workers=args.w,
                                              batch_size=args.b,
                                              shuffle=args.s)
    cifar10_test_loader = get_test_dataloader(args.data_root,
                                              (0.4914, 0.4822, 0.4465),
                                              (0.2023, 0.1994, 0.2010),
                                              num_workers=args.w,
                                              batch_size=args.b,
                                              shuffle=args.s)
    net = Cnn(num_class=10).cuda()
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr)
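# Only the tail of the test routine survives at the top of this snippet. A reconstruction
# consistent with those lines; the function name and the per-batch loss accounting are
# assumptions:
@torch.no_grad()
def eval_testing(net, cifar10_test_loader, loss_function):
    net.eval()
    test_loss = 0.0
    correct = 0
    for images, labels in cifar10_test_loader:
        images, labels = images.cuda(), labels.cuda()
        outputs = net(images)
        # accumulate the summed loss so the final division yields a per-sample average
        test_loss += loss_function(outputs, labels).item() * labels.size(0)
        _, preds = outputs.max(1)
        correct += preds.eq(labels).sum()
    print('Test set: Average loss: {:.4f}, Accuracy: {:.4f}'.format(
        test_loss / len(cifar10_test_loader.dataset),
        correct.float() / len(cifar10_test_loader.dataset)))
    return correct.float() / len(cifar10_test_loader.dataset)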
def main(args):
    print(f'started: {args}')
    torch.cuda.set_device(args.gpu)

    cifar100_training_loader = get_training_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=args.b,
        shuffle=True)
    cifar100_test_loader = get_test_dataloader(
        settings.CIFAR100_TRAIN_MEAN,
        settings.CIFAR100_TRAIN_STD,
        num_workers=4,
        batch_size=args.b,
        shuffle=True)

    net = resnet18(with_permute_adain=(args.padain > 0), p_adain=args.padain)
    net = net.cuda()
    print(net)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=settings.MILESTONES, gamma=0.2)  # learning rate decay
    iter_per_epoch = len(cifar100_training_loader)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * args.warm)

    checkpoint_path = os.path.join(settings.CHECKPOINT_PATH, 'cifar100', args.net,
                                   str(args.padain), settings.TIME_NOW)

    # use tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    writer = SummaryWriter(log_dir=os.path.join(
        settings.LOG_DIR, args.net + '_padain' + str(args.padain), settings.TIME_NOW))
    input_tensor = torch.Tensor(1, 3, 32, 32).cuda()
    writer.add_graph(net, input_tensor)

    # create checkpoint folder to save model
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)
    checkpoint_path = os.path.join(checkpoint_path, '{net}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, settings.EPOCH):
        if epoch > args.warm:
            train_scheduler.step(epoch)

        train(net, cifar100_training_loader, warmup_scheduler, optimizer,
              loss_function, writer, epoch, args.warm, args.b)
        acc = eval_training(net, cifar100_test_loader, loss_function, writer, epoch)

        # start saving the best-performing model after the learning rate decays to 0.01
        if epoch > settings.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(), checkpoint_path.format(net=args.net, type='best'))
            best_acc = acc
            continue

        if not epoch % settings.SAVE_EPOCH:
            torch.save(net.state_dict(), checkpoint_path.format(net=args.net, type='other'))

    writer.close()
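# The train() call above drives the warm-up scheduler per iteration. A hedged sketch of
# what such a training function typically looks like in this family of scripts (the body
# is an assumption; batch_size is kept only to match the call above):
def train(net, train_loader, warmup_scheduler, optimizer, loss_function,
          writer, epoch, warm, batch_size):
    net.train()
    for batch_index, (images, labels) in enumerate(train_loader):
        images, labels = images.cuda(), labels.cuda()
        optimizer.zero_grad()
        loss = loss_function(net(images), labels)
        loss.backward()
        optimizer.step()
        if epoch <= warm:
            warmup_scheduler.step()  # per-iteration LR ramp during warm-up epochs
        n_iter = (epoch - 1) * len(train_loader) + batch_index + 1
        writer.add_scalar('Train/loss', loss.item(), n_iter)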
parser.add_argument('-resume', action='store_true', default=False, help='resume training')
args = parser.parse_args()

typea = 8
typeb = 9
net = get_network(args)

# training set
trainpath = '/home/steadysjtu/classification/train_1/'
# test set
testpath = '/home/steadysjtu/classification/test_1/'
# paths to the cell sub-images

# data preprocessing:
# preprocessing reference: https://www.cnblogs.com/wanghui-garcia/p/11448460.html
cell_training_loader = get_training_dataloader(
    path=trainpath,
    mean=cell_train_mean,
    std=cell_train_std,
    num_workers=8,
    batch_size=1,
    shuffle=True
)
cell_test_loader = get_test_dataloader(
    path=testpath,
    mean=cell_train_mean,
    std=cell_train_std,
    num_workers=16,
    batch_size=1,
    shuffle=True
)
cell_train_test_loader = get_test_dataloader(
    path=trainpath,
    mean=cell_train_mean,
    std=cell_train_std,
                       0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401,
                      0.2564384629170883,
                      0.27615047132568404)

# data preprocessing:
if args.new_data_loader:
    cifar100_training_loader = DataLoaderConstructor(
        train=True, batch_size=args.b, workers=args.w).data_loader
    cifar100_test_loader = DataLoaderConstructor(
        train=False, batch_size=args.b, workers=args.w).data_loader
else:
    cifar100_training_loader = get_training_dataloader(
        CIFAR100_TRAIN_MEAN,
        CIFAR100_TRAIN_STD,
        num_workers=args.w,
        batch_size=args.b,
        shuffle=args.s,
        disable_rotate=args.disable_rotate)
    cifar100_test_loader = get_test_dataloader(
        CIFAR100_TRAIN_MEAN,
        CIFAR100_TRAIN_STD,
        num_workers=args.w,
        batch_size=args.b,
        shuffle=args.s)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=weight_decay)
        })
        with open(os.path.join(os.path.dirname(save_path), "info.json"), "w", encoding="utf8") as f:
            json.dump(info, f)
        torch.save(self.model.state_dict(), f"{save_path}-best")

    @staticmethod
    def train_tl(origin_model_path, save_path, train_loader, test_loader, device, choice="resnet50"):
        print(f"transfer learning on model: {origin_model_path}")
        model = TLResNet.create_model(choice)
        model.load_model(origin_model_path)
        trainer = Trainer(model=model, train_loader=train_loader,
                          test_loader=test_loader, device=device)
        trainer.train(save_path)

if __name__ == '__main__':
    train_loader = get_training_dataloader("cifar100")
    test_loader = get_test_dataloader("cifar100")
    model = resnet.resnet50()
    trainer = Trainer(model, train_loader, test_loader)
    trainer.train("1")
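# Trainer's full definition is not shown. A minimal skeleton consistent with the calls
# above (a constructor storing the loaders, and train(save_path) running the epoch loop
# and saving the best checkpoint); the body, defaults, and optimizer choice are assumptions:
class Trainer:
    def __init__(self, model, train_loader, test_loader, device="cuda"):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = device
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.1, momentum=0.9)

    @torch.no_grad()
    def evaluate(self):
        self.model.eval()
        correct = 0
        for images, labels in self.test_loader:
            images, labels = images.to(self.device), labels.to(self.device)
            correct += self.model(images).argmax(1).eq(labels).sum().item()
        return correct / len(self.test_loader.dataset)

    def train(self, save_path, epochs=200):
        best_acc = 0.0
        for epoch in range(epochs):
            self.model.train()
            for images, labels in self.train_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                loss = self.criterion(self.model(images), labels)
                loss.backward()
                self.optimizer.step()
            acc = self.evaluate()
            if acc > best_acc:
                best_acc = acc
                torch.save(self.model.state_dict(), f"{save_path}-best")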
net = get_network(args)

# data preprocessing:
if args.task == "cifar100":
    mean = settings.CIFAR100_TRAIN_MEAN
    std = settings.CIFAR100_TRAIN_STD
elif args.task == "cifar10":
    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)
else:
    # fail fast instead of continuing with undefined mean/std
    raise ValueError("invalid task: {}".format(args.task))

cifar_training_loader = get_training_dataloader(mean, std,
                                                num_workers=4,
                                                batch_size=args.b,
                                                shuffle=True,
                                                alpha=0.0,
                                                task=args.task,
                                                da=True)
cifar_test_loader = get_test_dataloader(mean, std,
                                        num_workers=4,
                                        batch_size=args.b,
                                        shuffle=False,
                                        task=args.task)
# loader for measuring training accuracy
cifar_train_test_loader = get_test_dataloader(mean, std,
                                              num_workers=4,
                                              batch_size=args.b,
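# The alpha keyword passed to get_training_dataloader above suggests mixup-style
# augmentation (alpha=0.0 disables it). As a hedged illustration of the standard mixup
# computation, not necessarily this loader's implementation:
import numpy as np
import torch

def mixup_batch(images, labels, alpha):
    """Standard mixup: convex-combine random pairs of examples; return both label sets."""
    if alpha <= 0:
        return images, labels, labels, 1.0  # alpha == 0 disables mixup
    lam = np.random.beta(alpha, alpha)
    index = torch.randperm(images.size(0))
    mixed = lam * images + (1 - lam) * images[index]
    return mixed, labels, labels[index], lam

# the corresponding loss is the same convex combination:
# loss = lam * criterion(out, y_a) + (1 - lam) * criterion(out, y_b)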
                      test_loss / len(cifar100_test_loader.dataset), epoch)
    writer.add_scalar('Test/Accuracy',
                      correct.float() / len(cifar100_test_loader.dataset), epoch)
    return correct.float() / len(cifar100_test_loader.dataset)

if __name__ == '__main__':
    net = get_network()

    # data preprocessing:
    cifar100_training_loader = get_training_dataloader(CIFAR100_TRAIN_MEAN,
                                                       CIFAR100_TRAIN_STD,
                                                       num_workers=4,
                                                       batch_size=32,
                                                       shuffle=True)
    cifar100_test_loader = get_test_dataloader(CIFAR100_TRAIN_MEAN,
                                               CIFAR100_TRAIN_STD,
                                               num_workers=4,
                                               batch_size=32,
                                               shuffle=True)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    # optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)
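# This snippet stops after the optimizer. A hedged sketch of the epoch loop that usually
# follows in these scripts; the epoch count, log directory, and the train/eval_training
# helper signatures are assumptions:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/cifar100')  # hypothetical log directory
for epoch in range(1, 201):  # 200 epochs is an assumption
    train(net, cifar100_training_loader, optimizer, loss_function, writer, epoch)
    acc = eval_training(net, cifar100_test_loader, loss_function, writer, epoch)
writer.close()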