def get_model(arch, wts_path):
    if arch == 'alexnet':
        model = AlexNet()
        model.fc = nn.Sequential()
        load_weights(model, wts_path)
    elif arch == 'pt_alexnet':
        model = models.alexnet()
        classif = list(model.classifier.children())[:5]
        model.classifier = nn.Sequential(*classif)
        load_weights(model, wts_path)
    elif arch == 'mobilenet':
        model = MobileNetV2()
        model.fc = nn.Sequential()
        load_weights(model, wts_path)
    elif 'resnet' in arch:
        model = models.__dict__[arch]()
        model.fc = nn.Sequential()
        load_weights(model, wts_path)
    else:
        raise ValueError('arch not found: ' + arch)

    # Freeze all parameters so the backbone acts as a fixed feature extractor.
    for p in model.parameters():
        p.requires_grad = False
    return model
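
# Hypothetical usage sketch (not part of the original source): get_model()
# returns a frozen backbone with its classification head stripped, so a
# typical next step is to train a linear probe on top of it. The checkpoint
# path, feature dimension (2048) and class count (1000) are assumptions for
# a ResNet-50 on ImageNet; module-level imports of nn and optim are assumed.
backbone = get_model('resnet50', 'checkpoints/backbone.pth')
backbone.eval()
linear_probe = nn.Linear(2048, 1000)
probe_optimizer = optim.SGD(linear_probe.parameters(), lr=0.01, momentum=0.9)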
def train(train_dataset, val_dataset, configs):
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configs["batch_size"], shuffle=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=configs["batch_size"], shuffle=False)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = AlexNet().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=model.parameters(), lr=configs["lr"])

    for epoch in range(configs["epochs"]):
        model.train()
        running_loss = 0.0
        correct = 0
        for inputs, labels in tqdm(train_loader):
            inputs, labels = inputs.to(device), labels.squeeze().to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).sum().item()
            running_loss += loss.item()
        # running_loss is a sum of per-batch mean losses, so average over the
        # number of batches, not the number of samples.
        print("[%d] loss: %.4f, train acc: %.4f" %
              (epoch + 1, running_loss / len(train_loader),
               correct / len(train_dataset)))

        model.eval()
        correct = 0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader):
                inputs, labels = inputs.to(device), labels.squeeze().to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels).sum().item()
        print("Accuracy of the network on the %d test images: %.4f %%" %
              (len(val_dataset), 100. * correct / len(val_dataset)))

    torch.save(model.state_dict(), "/opt/output/model.pt")
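
# Hypothetical invocation (a sketch, not from the source): train() only reads
# the three config keys used above, so a minimal configs dict suffices. The
# train_dataset and val_dataset objects are assumed to exist already.
configs = {"batch_size": 64, "lr": 1e-4, "epochs": 10}
train(train_dataset, val_dataset, configs)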
def jobSetup():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    done = False  # Set to True once the user stops adding jobs
    joblist = []
    while not done:
        # These booleans control the state of the menu
        SessionTypeBool = True
        ModelTypeBool = True
        EpochBool = True
        TrainBatchBool = True
        OptimBool = True
        TestBatchBool = True
        jobBool = True

        # -------------------------------- Model Selection -------------------------------- #
        while ModelTypeBool:
            modeltype = input(" a.Alexnet \n b.VGG16 \n c.ResNext \n d.VGGv2\n >")
            if modeltype == 'a':
                model = AlexNet()
                modeldict = 'Alexnet-model.pt'
                modelname = "Alexnet"
                valtrain = 32
                valtest = 136
                optimizer = optim.Adam(model.parameters(), lr=0.001)
                ModelTypeBool = False
            elif modeltype == 'b':
                model = VGG16()
                modeldict = 'VGG16-model.pt'
                modelname = "VGG16"
                valtrain = 32
                valtest = 136
                optimizer = optim.SGD(model.parameters(), lr=0.001)
                ModelTypeBool = False
            elif modeltype == 'c':
                model = resnext50_32x4d()
                modeldict = 'ResNext50-model.pt'
                modelname = "ResNext50"
                valtrain = 32
                valtest = 136
                optimizer = optim.Adam(model.parameters(), lr=0.001)
                ModelTypeBool = False
            elif modeltype == 'd':
                model = VGG_v2()
                modeldict = 'VGGv2-model.pt'
                modelname = "VGGv2"
                valtrain = 32
                valtest = 136
                optimizer = optim.Adam(model.parameters(), lr=0.001)
                ModelTypeBool = False
            else:
                print("Please input a valid model input")
        print(modelname + ": chosen")

        # ------------------------------- Session Selection ------------------------------- #
        while SessionTypeBool:
            sessiontype = input(" a.Start Training a new model \n b.Test the model \n >")
            if sessiontype == 'a':
                SessionTypeBool = False
                print("From Scratch: chosen")
            elif sessiontype == 'b':
                SessionTypeBool = False
                TrainBatchBool = False
                OptimBool = False
                EpochBool = False
                valtrain = 1
                epochval = 1
                print("Testing: chosen")
            # UNCOMMENT FOR CONTINUE TRAINING OPTION. Uncomment and use at your own risk!
            # elif sessiontype == 'c':
            #     SessionTypeBool = False
            #     print("Continue Training: chosen")
            else:
                print("Please input a valid session input")

        # -------------------------------- Epoch Selection -------------------------------- #
        while EpochBool:
            epoch = input(" Number of Epochs: ")
            try:
                epochval = int(epoch)
                print(f'\nEpochs chosen: {epochval}')
                EpochBool = False
            except ValueError:
                print("Please input a valid Epochs input")

        # The next section is DEVELOPER USE ONLY. We do not want the user to change
        # the training or test batch numbers, as this can lead to CUDA out-of-memory
        # errors. Uncomment and use at your own risk!
        """
        # ------------------------------ Optimizer Selection ------------------------------ #
        while OptimBool:
            optimiseinput = input(" Optimizer (Debug): \n a.Adam \n b.SGD \n >")
            if optimiseinput == 'a':
                optimizer = optim.Adam(model.parameters(), lr=0.001)
                print("Adam chosen")
                OptimBool = False
            elif optimiseinput == 'b':
                optimizer = optim.SGD(model.parameters(), lr=0.001)
                print("SGD chosen")
                OptimBool = False
            else:
                print("Please input a valid Optimizer input")

        # -------------------------------- Batch Selection -------------------------------- #
        while TrainBatchBool:
            trainbatch = input(" Number of train batches (Debug): ")
            try:
                valtrain = int(trainbatch)
                print(f'\ntraining batches chosen: {valtrain}')
                TrainBatchBool = False
            except ValueError:
                print("Please input a valid batches input")

        while TestBatchBool:
            testbatch = input(" Number of test batches (Debug): ")
            try:
                valtest = int(testbatch)
                print(f'\ntest batches chosen: {valtest}')
                TestBatchBool = False
            except ValueError:
                print("Please input a valid batches input")
        """

        # ------------------------------------ Job Menu ------------------------------------ #
        job = jobclass(sessiontype, model, modeldict, optimizer, epochval,
                       device, valtrain, valtest, modelname)
        joblist.append(job)
        while jobBool:
            finish = input(" Would you like to run another Model after? y/n: ")
            if finish == 'y':
                jobBool = False
                print("Add another job")
            elif finish == 'n':
                jobBool = False
                done = True
                print("Jobs Executing")
            else:
                print("Please input a valid job input")
    return joblist
# net = resnet50()
# net = resnet18()

# Rewrite the network's final layer
# fc_in_features = net.fc.in_features  # input features of the final layer
# net.fc = nn.Linear(in_features=fc_in_features, out_features=cfg.num_classes)

# Move the network and loss function onto the GPU; configure the optimizer
net = net.to(cfg.device)
# net = nn.DataParallel(net, device_ids=[0, 1])
# criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss().cuda(device=cfg.device)
criterion = nn.CrossEntropyLoss().cuda(device=cfg.device)

# Standard optimizers: stochastic gradient descent and Adam
# optimizer = optim.SGD(params=net.parameters(), lr=cfg.learning_rate,
#                       weight_decay=cfg.weight_decay, momentum=cfg.momentum)
optimizer = optim.Adam(params=net.parameters(), lr=cfg.learning_rate,
                       weight_decay=cfg.weight_decay)

# Optimizer for the linear learning-rate schedule
# optimizer = optim.SGD(params=net.parameters(), lr=cfg.learning_rate,
#                       weight_decay=cfg.weight_decay, momentum=cfg.momentum)

# -------------- Training -----------------
# print('Training....')
# train_and_valid_(net, criterion=criterion,
#                  optimizer=optimizer,
#                  train_loader=train_loader,
#                  valid_loader=valid_loader, cfg=cfg,
#                  is_lr_warmup=False, is_lr_adjust=False)

# -------------- Testing -----------------
print('Testing.....')
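
# cfg is assumed to be a simple config namespace; a minimal sketch of the
# fields referenced above (all values are illustrative assumptions):
from types import SimpleNamespace
cfg = SimpleNamespace(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    num_classes=10,
    learning_rate=1e-3,
    weight_decay=1e-4,
    momentum=0.9,
)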
def train(data_train, data_val, num_classes, num_epoch, milestones):
    model = AlexNet(num_classes, pretrain=False)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    lr_scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)

    since = time.time()
    best_acc = 0
    best = 0
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-' * 10)

        # Iterate over data.
        running_loss = 0.0
        running_corrects = 0
        model.train()
        with torch.set_grad_enabled(True):
            for i, (inputs, labels) in enumerate(data_train):
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # Accumulate per-batch accuracy; dividing by len(data_train)
                # below yields the mean accuracy over batches.
                running_corrects += torch.sum(preds == labels.data) * 1. / inputs.size(0)
                print("\rIteration: {}/{}, Loss: {}.".format(
                    i + 1, len(data_train), loss.item()), end="")
                sys.stdout.flush()
        avg_loss = running_loss / len(data_train)
        t_acc = running_corrects.double() / len(data_train)

        running_loss = 0.0
        running_corrects = 0
        model.eval()
        with torch.set_grad_enabled(False):
            for i, (inputs, labels) in enumerate(data_val):
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data) * 1. / inputs.size(0)
        val_loss = running_loss / len(data_val)
        val_acc = running_corrects.double() / len(data_val)

        print()
        print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc))
        print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc))
        print('lr rate: {:.6f}'.format(optimizer.param_groups[0]['lr']))
        print()

        if val_acc > best_acc:
            best_acc = val_acc
            best = epoch + 1
        lr_scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best Validation Accuracy: {}, Epoch: {}'.format(best_acc, best))
    return model
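
# Hypothetical call (loader names and milestone values are assumptions):
# data_train and data_val are DataLoaders, and milestones are the epochs at
# which MultiStepLR multiplies the learning rate by gamma=0.1.
model = train(data_train=train_loader, data_val=val_loader,
              num_classes=10, num_epoch=30, milestones=[15, 25])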
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--stage', default='train', type=str)
    parser.add_argument('--dataset', default='imagenet', type=str)
    parser.add_argument('--lr', default=0.0012, type=float)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--gpus', default='0,1,2,3', type=str)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--max_epoch', default=30, type=int)
    parser.add_argument('--lr_decay_steps', default='15,20,25', type=str)
    parser.add_argument('--exp', default='', type=str)
    parser.add_argument('--list', default='', type=str)
    parser.add_argument('--resume_path', default='', type=str)
    parser.add_argument('--pretrain_path', default='', type=str)
    parser.add_argument('--n_workers', default=32, type=int)
    parser.add_argument('--network', default='resnet50', type=str)

    global args
    args = parser.parse_args()

    os.makedirs(args.exp, exist_ok=True)
    os.makedirs(os.path.join(args.exp, 'runs'), exist_ok=True)
    os.makedirs(os.path.join(args.exp, 'models'), exist_ok=True)
    os.makedirs(os.path.join(args.exp, 'logs'), exist_ok=True)

    # logger initialize
    logger = getLogger(args.exp)

    device_ids = [int(x) for x in args.gpus.split(',')]
    device = torch.device('cuda:0')

    if args.dataset.startswith('cifar'):
        train_loader, val_loader = cifar.get_semi_dataloader(args)
    else:
        train_loader, val_loader = imagenet.get_semi_dataloader(args)

    # create model
    if args.network == 'alexnet':
        network = AlexNet(128)
    elif args.network == 'alexnet_cifar':
        network = AlexNet_cifar(128)
    elif args.network == 'resnet18_cifar':
        network = ResNet18_cifar()
    elif args.network == 'resnet50_cifar':
        network = ResNet50_cifar()
    elif args.network == 'wide_resnet28':
        network = WideResNet(28, 10 if args.dataset == 'cifar10' else 100, 2)
    elif args.network == 'resnet18':
        network = resnet18()
    elif args.network == 'resnet50':
        network = resnet50()
    else:
        raise ValueError('unknown network: ' + args.network)
    network = nn.DataParallel(network, device_ids=device_ids)
    network.to(device)

    # Linear evaluation head (dimensions match a ResNet-50 backbone on ImageNet)
    classifier = nn.Linear(2048, 1000).to(device)

    # create optimizer
    parameters = network.parameters()
    optimizer = torch.optim.SGD(
        parameters,
        lr=args.lr,
        momentum=0.9,
        weight_decay=args.weight_decay,
    )
    cls_optimizer = torch.optim.SGD(
        classifier.parameters(),
        lr=args.lr * 50,
        momentum=0.9,
        weight_decay=args.weight_decay,
    )

    cudnn.benchmark = True

    # create memory_bank
    global writer
    writer = SummaryWriter(comment='SemiSupervised',
                           logdir=os.path.join(args.exp, 'runs'))

    # create criterion
    criterion = nn.CrossEntropyLoss()

    logging.info(beautify(args))
    start_epoch = 0
    if args.pretrain_path != '' and args.pretrain_path != 'none':
        logging.info('loading pretrained file from {}'.format(args.pretrain_path))
        checkpoint = torch.load(args.pretrain_path)
        state_dict = checkpoint['state_dict']
        # Keep only weights that exist in the current network, skipping the fc head.
        valid_state_dict = {
            k: v for k, v in state_dict.items()
            if k in network.state_dict() and 'fc.' not in k
        }
        for k, v in network.state_dict().items():
            if k not in valid_state_dict:
                logging.info('{}: Random Init'.format(k))
                valid_state_dict[k] = v
        # logging.info(valid_state_dict.keys())
        network.load_state_dict(valid_state_dict)
    else:
        logging.info('Training SemiSupervised Learning From Scratch')

    logging.info('start training')
    best_acc = 0.0
    try:
        for i_epoch in range(start_epoch, args.max_epoch):
            train(i_epoch, network, classifier, criterion, optimizer,
                  cls_optimizer, train_loader, device)

            checkpoint = {
                'epoch': i_epoch + 1,
                'state_dict': network.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint,
                       os.path.join(args.exp, 'models', 'checkpoint.pth'))

            adjust_learning_rate(args.lr_decay_steps, optimizer, i_epoch)
            if i_epoch % 2 == 0:
                acc1, acc5 = validate(i_epoch, network, classifier,
                                      val_loader, device)
                if acc1 >= best_acc:
                    best_acc = acc1
                    torch.save(checkpoint,
                               os.path.join(args.exp, 'models', 'best.pth'))
                writer.add_scalar('acc1', acc1, i_epoch + 1)
                writer.add_scalar('acc5', acc5, i_epoch + 1)
                logging.info(
                    colorful('[Epoch: {}] val acc: {:.4f}/{:.4f}'.format(
                        i_epoch, acc1, acc5)))
                logging.info(
                    colorful('[Epoch: {}] best acc: {:.4f}'.format(
                        i_epoch, best_acc)))

            if i_epoch in [30, 60, 120, 160, 200]:
                torch.save(
                    checkpoint,
                    os.path.join(args.exp, 'models',
                                 '{}.pth'.format(i_epoch + 1)))

            with torch.no_grad():
                for name, param in network.named_parameters():
                    if 'bn' not in name:
                        writer.add_histogram(name, param, i_epoch)
    except KeyboardInterrupt:
        logging.info('KeyboardInterrupt at {} Epochs'.format(i_epoch))
        exit()
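
# adjust_learning_rate() is called above but not defined in this snippet.
# A plausible sketch, assuming lr_decay_steps is the comma-separated string
# from --lr_decay_steps, that each passed step multiplies the base lr by 0.1,
# and that args is the module-level global set in main():
def adjust_learning_rate(lr_decay_steps, optimizer, epoch):
    steps = [int(s) for s in lr_decay_steps.split(',')]
    decay = 0.1 ** sum(epoch >= s for s in steps)
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.lr * decay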
def main():
    # Init logger
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    log = open(os.path.join(args.save_path,
                            'log_seed_{}.txt'.format(args.manualSeed)), 'w')
    print_log('save path : {}'.format(args.save_path), log)
    state = {k: v for k, v in args._get_kwargs()}
    print_log(state, log)
    print_log("Random Seed: {}".format(args.manualSeed), log)
    print_log("python version : {}".format(sys.version.replace('\n', ' ')), log)
    print_log("torch version : {}".format(torch.__version__), log)
    print_log("cudnn version : {}".format(torch.backends.cudnn.version()), log)

    # Data loading code
    # Any other preprocessing? http://pytorch.org/audio/transforms.html
    sample_length = 10000
    scale = transforms.Scale()
    padtrim = transforms.PadTrim(sample_length)
    downmix = transforms.DownmixMono()
    transforms_audio = transforms.Compose([scale, padtrim, downmix])

    if not os.path.isdir(args.data_path):
        os.makedirs(args.data_path)
    train_dir = os.path.join(args.data_path, 'train')
    val_dir = os.path.join(args.data_path, 'val')

    # Choose dataset to use
    if args.dataset == 'arctic':
        # TODO No ImageFolder equivalent for audio. Need to create a Dataset manually
        train_dataset = Arctic(train_dir, transform=transforms_audio, download=True)
        val_dataset = Arctic(val_dir, transform=transforms_audio, download=True)
        num_classes = 4
    elif args.dataset == 'vctk':
        train_dataset = dset.VCTK(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.VCTK(val_dir, transform=transforms_audio, download=True)
        num_classes = 10
    elif args.dataset == 'yesno':
        train_dataset = dset.YESNO(train_dir, transform=transforms_audio, download=True)
        val_dataset = dset.YESNO(val_dir, transform=transforms_audio, download=True)
        num_classes = 2
    else:
        assert False, 'Dataset is incorrect'

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        # pin_memory=True would page-lock host memory to speed up CPU-to-GPU copies
        # sampler=None lets the loader fall back to shuffled sampling
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    # Feed in respective model file to pass into model (alexnet.py)
    print_log("=> creating model '{}'".format(args.arch), log)
    # Init model, criterion, and optimizer
    # net = models.__dict__[args.arch](num_classes)
    net = AlexNet(num_classes)
    # print_log("=> network :\n {}".format(net), log)
    # net = torch.nn.DataParallel(net, device_ids=list(range(args.ngpu)))

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    # Define stochastic gradient descent as optimizer (run backprop on a random small batch)
    optimizer = torch.optim.SGD(net.parameters(), state['learning_rate'],
                                momentum=state['momentum'],
                                weight_decay=state['decay'], nesterov=True)

    # Use the GPU if available
    if args.use_cuda:
        net.cuda()
        criterion.cuda()

    recorder = RecorderMeter(args.epochs)
    # optionally resume from a checkpoint
    # Needs the same Python version that the checkpoint was saved with
    if args.resume:
        if os.path.isfile(args.resume):
            print_log("=> loading checkpoint '{}'".format(args.resume), log)
            if args.ngpu == 0:
                checkpoint = torch.load(args.resume,
                                        map_location=lambda storage, loc: storage)
            else:
                checkpoint = torch.load(args.resume)
            recorder = checkpoint['recorder']
            args.start_epoch = checkpoint['epoch']
            net.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print_log("=> loaded checkpoint '{}' (epoch {})"
                      .format(args.resume, checkpoint['epoch']), log)
        else:
            print_log("=> no checkpoint found at '{}'".format(args.resume), log)
    else:
        print_log("=> do not use any checkpoint for {} model".format(args.arch), log)

    if args.evaluate:
        validate(val_loader, net, criterion, 0, log, val_dataset)
        return

    # Main loop
    start_time = time.time()
    epoch_time = AverageMeter()

    # Training occurs here
    for epoch in range(args.start_epoch, args.epochs):
        current_learning_rate = adjust_learning_rate(optimizer, epoch,
                                                     args.gammas, args.schedule)

        need_hour, need_mins, need_secs = convert_secs2time(
            epoch_time.avg * (args.epochs - epoch))
        need_time = '[Need: {:02d}:{:02d}:{:02d}]'.format(need_hour, need_mins, need_secs)

        print_log('\n==>>{:s} [Epoch={:03d}/{:03d}] {:s} [learning_rate={:6.4f}]'.format(
                      time_string(), epoch, args.epochs, need_time, current_learning_rate)
                  + ' [Best : Accuracy={:.2f}, Error={:.2f}]'.format(
                      recorder.max_accuracy(False),
                      100 - recorder.max_accuracy(False)), log)
        print("One epoch")

        # train for one epoch
        # Call to train (note that our previous net is passed into the model argument)
        train_acc, train_los = train(train_loader, net, criterion, optimizer,
                                     epoch, log, train_dataset)

        # evaluate on validation set
        # val_acc, val_los = extract_features(test_loader, net, criterion, log)
        val_acc, val_los = validate(val_loader, net, criterion, epoch, log, val_dataset)
        is_best = recorder.update(epoch, train_los, train_acc, val_los, val_acc)

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': net.state_dict(),
            'recorder': recorder,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save_path, 'checkpoint.pth.tar')

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        recorder.plot_curve(os.path.join(args.save_path, 'curve.png'))

    log.close()
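
# print_log() is used throughout but not defined in this snippet; a minimal
# sketch of what it likely does (an assumption, not the original utility):
def print_log(print_string, log):
    print("{}".format(print_string))
    log.write('{}\n'.format(print_string))
    log.flush()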
        # (Fragment of the evaluation loop: P counts correct predictions, N counts
        # samples seen; indentation reconstructed from context.)
        P += (pred_cls == input_label_tensor).sum().cpu().detach().numpy()
        N += HyperParams["batch_size"]
        if idx % 500 == 499:
            print("|acc:%f|use time:%s|" % (float(P / N), str(time.time() - start_time)))
            start_time = time.time()
    # print('')


if __name__ == '__main__':
    train_data = mnist.MNIST("./mnist_data")
    model = AlexNet(10)
    if HyperParams["cuda"]:
        model = model.cuda()
    optimizer = torch.optim.Adam(params=[{"params": model.parameters()}], lr=0.004)
    lr_sch = torch.optim.lr_scheduler.MultiStepLR(optimizer, [1, 2, 3, 4], 0.1)
    criterion = torch.nn.CrossEntropyLoss()

    # Resume from the epoch-4 snapshot
    static_params = torch.load("./%s_E%d.snap" % (HyperParams["model_save_prefix"], 4))
    model.load_state_dict(static_params)
    # trainval(model, optimizer, lr_sch, criterion, train_data)

    if HyperParams["quantize"]:
        # Dynamic quantization converts eligible layers (e.g. nn.Linear) to int8
        # at inference time.
        model = torch.quantization.quantize_dynamic(model)
        torch.save(model.state_dict(), "./quantize_mode.snap")
    test(model, train_data)
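
# HyperParams is referenced above but not defined in this snippet; a minimal
# sketch of the keys it would need (all values are illustrative assumptions):
HyperParams = {
    "cuda": torch.cuda.is_available(),
    "batch_size": 64,
    "model_save_prefix": "alexnet_mnist",
    "quantize": True,
}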