def main():
    """Train and evaluate AlexNet on CIFAR-10/100 with Horovod.

    Configuration is read from the module-level ``args``. The function also
    relies on the module-level ``hvd``, ``world_size``, ``local_rank``,
    ``use_cuda`` and ``state`` objects and on the ``train``, ``test`` and
    ``adjust_learning_rate`` helpers defined elsewhere in the project.
    One summary line is printed per epoch; nothing is returned.
    """
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root=args.dataroot, train=True, download=True,
                          transform=transform_train)
    # Each Horovod rank reads its own shard of the training set.
    sampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    trainloader = data.DataLoader(dataset=trainset,
                                  batch_size=args.train_batch * world_size,
                                  shuffle=False,
                                  sampler=sampler)

    testset = dataloader(root=args.dataroot, train=False, download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch * world_size,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format("Alexnet"))
    model = AlexNet(num_classes=num_classes)
    device = torch.device('cuda', local_rank)
    model = model.to(device)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
    print('Model on cuda:%d' % local_rank)
    print(' Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # Wrap the optimizer with Horovod.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # Broadcast the initial parameters from rank 0 to every other rank.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        # Without set_epoch the sampler reuses the same permutation every
        # epoch, so each rank would see its samples in identical order.
        sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        train_loss, train_acc = train(trainloader, model, criterion,
                                      optimizer, epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda)
        print('Rank:{} Epoch[{}/{}]: LR: {:.3f}, Train loss: {:.5f}, Test loss: {:.5f}, Train acc: {:.2f}, Test acc: {:.2f}.'.format(
            local_rank, epoch + 1, args.epochs, state['lr'],
            train_loss, test_loss, train_acc, test_acc))
def train(pertrained=False, resume_file=None):
    """Train AlexNet, logging per-epoch statistics and saving checkpoints.

    Relies on module-level configuration (``NUMBER_CLASSES``, ``LR``,
    ``MOMENTUM``, ``WEIGHT_DECAY``, ``LR_steps``, ``EPOCHES``, ``BATCHSIZE``,
    ``CUDA_AVALIABLE``), on ``train_dataloader`` and ``criterion``, and on the
    ``AverageMeter``, ``adjust_learning_rate`` and ``valid`` helpers defined
    elsewhere in the project.

    Args:
        pertrained: When true, build the network with ``alexnet(pretrained=True)``
            from the project's ``model`` module; otherwise build ``AlexNet``.
        resume_file: Optional checkpoint path. When it names an existing file,
            the model weights and the starting epoch are restored from it.
    """
    if pertrained:
        from model import alexnet
        net = alexnet(pretrained=True, num_classes=NUMBER_CLASSES)
    else:
        from model import AlexNet
        net = AlexNet(num_classes=NUMBER_CLASSES)
    valid_precision = 0
    policies = net.parameters()
    optimizer = optim.SGD(policies,
                          lr=LR,
                          momentum=MOMENTUM,
                          weight_decay=WEIGHT_DECAY)

    stamp = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    # The context manager closes both logs even if training raises.
    with open("logs/train_logs_{}.log".format(stamp), "w") as train_log, \
            open("logs/valid_logs_{}.log".format(stamp), "w") as valid_log:
        train_log.write("{}\t{}\t{}\n".format("epoch", "losses ", "correct"))
        valid_log.write("{}\t{}\t{}\n".format("epoch", "losses ", "correct"))

        # Resume training. start_epoch must be defined even when no
        # resume file is given, otherwise the epoch loop raises.
        start_epoch = 0
        if resume_file:
            if os.path.isfile(resume_file):
                print(("=> loading checkpoint '{}'".format(resume_file)))
                checkpoint = torch.load(resume_file)
                start_epoch = checkpoint['epoch']
                net.load_state_dict(checkpoint['model_state_dict'])
                print(("=> loaded checkpoint '{}' (epoch {})".format(
                    resume_file, checkpoint['epoch'])))
            else:
                print(("=> no checkpoint found at '{}'".format(resume_file)))

        # valid_precision = valid(net)
        for epoch in range(start_epoch, EPOCHES):
            batch_time = AverageMeter()
            data_time = AverageMeter()
            losses = AverageMeter()
            correct = AverageMeter()
            end = time.time()

            optimizer = adjust_learning_rate(optimizer, epoch, LR, LR_steps,
                                             WEIGHT_DECAY)

            for i_batch, sample_batched in enumerate(train_dataloader):
                # measure data loading time
                data_time.update(time.time() - end)
                inputs, labels = sample_batched
                if CUDA_AVALIABLE:
                    outputs = net.forward(inputs.cuda())
                    labels = labels.long().flatten().cuda()
                else:
                    outputs = net.forward(inputs)
                    labels = labels.long().flatten()

                outputs = outputs.reshape([-1, NUMBER_CLASSES])
                loss = criterion(outputs, labels)
                # Update the running statistics.
                losses.update(loss.item(), inputs.size(0))
                _, predicted = torch.max(outputs.data, 1)
                # Batch accuracy.
                correct.update(
                    (predicted == labels.long()).sum().item() / len(labels),
                    inputs.size(0))

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()
                if i_batch % 10 == 0:
                    print(('Epoch: [{0}][{1}/{2}], lr: {lr:.5f}\t'
                           'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                           'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                           'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                           'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                               epoch,
                               i_batch,
                               len(train_dataloader),
                               batch_time=batch_time,
                               data_time=data_time,
                               loss=losses,
                               top1=correct,
                               lr=optimizer.param_groups[-1]['lr'])))

            train_log.write("{:5d}\t{:.5f}\t{:.5f}\n".format(
                epoch, losses.avg, correct.avg))
            train_log.flush()

            if epoch % 1 == 0:
                valid_precision = valid(net, epoch, valid_log)
            # Save the network.
            if (epoch > 0 and epoch % 10 == 0) or epoch == EPOCHES - 1:
                save_path = os.path.join(
                    "models",
                    "{:d}_{}_{:d}_{:d}_{:.5f}.pt".format(
                        int(time.time()), "alexnet", epoch, BATCHSIZE,
                        valid_precision))
                print("[INFO] Save weights to " + save_path)
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        # Key name kept for existing checkpoints; the value is
                        # now the state dict itself, not the bound method.
                        'optimizer_state_dir': optimizer.state_dict(),
                        'loss': loss
                    }, save_path)
loss = loss_function(outputs, labels) running_loss += loss loss.backward() optimizer.step() rate = index / train_data_loader.__len__() a = "*" * int(rate * 50) b = "." * int((1 - rate) * 50) print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format( int(rate * 100), a, b, loss), end="") print("\n time.perf_counter()-t1") model.eval() acc = 0.0 with torch.no_grad(): for data in valid_data_loader: imgs, labels = data outputs = model(imgs) acc += (torch.max(outputs, dim=1)[1] == labels).sum().item() acc = acc / valid_data_loader.dataset.__len__() if acc > best_acc: best_acc = acc print("Saving Model") torch.save(model.state_dict(), 'AlexNet_weights.pth') torch.save(model, 'AlexNet.pth') print('[epoch %d] train_loss: %.3f test_accuracy: %.3f' % (epoch + 1, running_loss, acc)) print('Finished Training')
def main():
    """Train AlexNet on an ImageFolder dataset and keep the best weights.

    The dataset is read from ``<cwd>/../../data_set/dog_data`` (``train`` and
    ``val`` sub-folders). The index-to-class mapping is written to
    ``class_indices.json`` and the weights with the highest validation
    accuracy are saved to ``./AlexNet.pth``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # choose the device
    print("using {} device.".format(device))

    data_transform = {  # preprocessing pipelines, keyed by split
        "train": transforms.Compose([transforms.RandomResizedCrop(224),  # random crop resized to 224*224
                                     transforms.RandomHorizontalFlip(),  # random horizontal flip
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),  # normalisation
        "val": transforms.Compose([transforms.Resize((224, 224)),  # cannot 224, must (224, 224)
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])}

    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    image_path = os.path.join(data_root, "data_set", "dog_data")  # dataset path
    assert os.path.exists(image_path), "{} path does not exist.".format(image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)  # number of training images

    # e.g. {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx  # class name -> index
    cla_dict = dict((val, key) for key, val in flower_list.items())  # inverted: index -> class name
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size, shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4, shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images fot validation.".format(train_num,
                                                                           val_num))
    # test_data_iter = iter(validate_loader)
    # test_image, test_label = test_data_iter.next()
    #
    # def imshow(img):
    #     img = img / 2 + 0.5  # unnormalize
    #     npimg = img.numpy()
    #     plt.imshow(np.transpose(npimg, (1, 2, 0)))
    #     plt.show()
    #
    # print(' '.join('%5s' % cla_dict[test_label[j].item()] for j in range(4)))
    # imshow(utils.make_grid(test_image))

    net = AlexNet(num_classes=5, init_weights=True)  # 5 output classes
    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    # pata = list(net.parameters())
    optimizer = optim.Adam(net.parameters(), lr=0.0002)  # Adam over all network parameters

    save_path = './AlexNet.pth'  # where the best weights are stored
    best_acc = 0.0
    train_steps = len(train_loader)  # batches per epoch
    for epoch in range(10):
        # train
        net.train()  # enable dropout
        running_loss = 0.0  # summed batch losses for this epoch
        t1 = time.perf_counter()  # epoch start time
        for step, data in enumerate(train_loader, start=0):
            images, labels = data
            optimizer.zero_grad()  # clear old gradients
            outputs = net(images.to(device))  # forward pass
            loss = loss_function(outputs, labels.to(device))
            loss.backward()  # backward pass
            optimizer.step()  # update the parameters

            # print statistics
            running_loss += loss.item()
            # print train process
            rate = (step + 1) / train_steps
            a = "*" * int(rate * 50)
            b = "." * int((1 - rate) * 50)
            print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(int(rate * 100), a, b, loss), end="")
        print()
        print(time.perf_counter() - t1)

        # validate
        net.eval()  # disable dropout
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]  # index of the largest logit is the class
                acc += (predict_y == val_labels.to(device)).sum().item()
            val_accurate = acc / val_num
            if val_accurate > best_acc:  # new best: remember it and save the weights
                best_acc = val_accurate
                torch.save(net.state_dict(), save_path)
            # Average over the number of batches. The last enumerate index is
            # one less than the batch count, and is 0 for a one-batch epoch.
            print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
                  (epoch + 1, running_loss / train_steps, val_accurate))

    print('Finished Training')
def train(args):
    """Train AlexNet and track validation loss, top-1 and top-5 accuracy.

    The weights with the lowest validation loss are written to
    ``best_model.pth``; every 10th epoch a full checkpoint goes to
    ``model_checkpoint.pth``; a one-line summary of the best epoch is appended
    to ``exp_result.txt`` when training ends.

    Args:
        args: Namespace providing ``device_id``, ``useLRN``, ``useDropOut``,
            ``lr``, ``batch_size``, ``num_workers``, ``useAug`` and ``epoch``.
    """
    device = torch.device(f"cuda:{args.device_id}")

    model = AlexNet(n_cls=100, useLRN=args.useLRN, useDropOut=args.useDropOut)
    # model = AlexNet(num_classes= 100)
    criterion = nn.CrossEntropyLoss()
    model.to(device)

    optimizer = Adam(model.parameters(), lr=args.lr)

    train_loader, valid_loader = getLoaders(split="train",
                                            batch_size=args.batch_size,
                                            num_workers=args.num_workers,
                                            aug=args.useAug)

    train_loss_arr = []   # one entry per training iteration
    valid_loss_arr = []   # one entry per epoch
    valid_acc_arr = []    # one entry per epoch
    valid_top5_arr = []   # one entry per epoch
    n_iter = 0

    best_loss = float('inf')
    best_top1_acc = 0
    best_top5_acc = 0

    for ep in range(args.epoch):
        model.train()
        for img, label in tqdm(train_loader, total=len(train_loader)):
            img, label = img.to(device), label.to(device)
            optimizer.zero_grad()

            pred = model(img)
            loss = criterion(pred, label)
            # loss = model.criterion(pred, label)
            loss.backward()
            optimizer.step()

            train_loss_arr.append(loss.item())
            n_iter += 1

        model.eval()
        ep_valid_loss_arr = []
        ep_acc_arr = []
        ep_top5_arr = []

        with torch.no_grad():
            for img, label in tqdm(valid_loader, total=len(valid_loader)):
                img, label = img.to(device), label.to(device)
                pred = model(img)
                loss = criterion(pred, label)
                # loss = model.criterion(pred, label)

                # Convert once and reuse for both top-k computations.
                pred_np = pred.detach().cpu().numpy()
                label_np = label.detach().cpu().numpy()
                acc = utils.top_k_acc(k=1, pred=pred_np, label=label_np)
                acc5 = utils.top_k_acc(k=5, pred=pred_np, label=label_np)

                ep_acc_arr.append(acc)
                ep_top5_arr.append(acc5)
                ep_valid_loss_arr.append(loss.item())

        valid_loss = np.mean(ep_valid_loss_arr)
        valid_acc = np.mean(ep_acc_arr)
        valid_top5 = np.mean(ep_top5_arr)
        train_loss = np.mean(train_loss_arr[-len(train_loader):])
        valid_loss_arr.append(valid_loss)
        # These histories are stored in the checkpoint below, so they have to
        # be filled in; previously they were always saved empty.
        valid_acc_arr.append(valid_acc)
        valid_top5_arr.append(valid_top5)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_top1_acc = valid_acc
            best_top5_acc = valid_top5
            model.cpu()
            torch.save(model.state_dict(), "best_model.pth")
            model.to(device)

        if (ep + 1) % 10 == 0:
            model.cpu()
            torch.save(
                {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "train_loss": train_loss_arr,
                    "valid_loss": valid_loss_arr,
                    "valid_acc": valid_acc_arr,
                    "valid_top5": valid_top5_arr,
                    "best_loss": best_loss,
                    "ep": ep,
                    "n_iter": n_iter,
                }, "model_checkpoint.pth")
            model.to(device)

        print(
            f"[{ep}, {n_iter}] train: {train_loss:.4f}, valid: {valid_loss:.4f}, acc: {valid_acc:.4f}, top5: {valid_top5:.4f}"
        )

    with open("exp_result.txt", "a+") as f:
        f.write(
            f"{args}, loss: {best_loss:.4f}, top1: {best_top1_acc*100:.1f}, top5: {best_top5_acc*100:.1f}\n"
        )
def main():
    """Train (or continue training) AlexNet on the ``./DATA/male`` image set.

    Training loss and validation accuracy are logged to TensorBoard under
    ``./logs``; the weights with the best validation accuracy are saved to
    ``./AlexNet.pth``; the loss and accuracy histories are stored as ``.npy``
    files under ``./save`` when training finishes. If ``./log360.pth`` exists,
    the pickled model in it is loaded and training continues from it.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("using {} device.".format(device))
    tbwriter = SummaryWriter(log_dir="./logs")

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(360),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]),
        "val":
        transforms.Compose([
            # The size must be a tuple: Resize's second positional argument
            # is the interpolation mode, not the width.
            transforms.Resize((360, 360)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "./DATA"))  # get data root path
    image_path = os.path.join(data_root, "male")  # dataset path
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=2)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 8
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=8,
                                                  shuffle=True,
                                                  num_workers=nw)

    print("using {} images for training, {} images fot validation.".format(
        train_num, val_num))

    if os.path.exists("./log360.pth"):
        #net.load_state_dict(torch.load("./log360.pth", map_location='cuda:2'))
        # NOTE(review): this unpickles a whole model object; only load files
        # you trust.
        net = torch.load("./log360.pth", 'cpu')
        print("continue training")
    else:
        net = AlexNet(num_classes=3, init_weights=True)
        print("start training anew")
    # Move the model in both cases; a resumed model was loaded onto the CPU
    # and would otherwise not match the device of the input batches.
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.98)

    epochs = 2000
    save_path = './AlexNet.pth'
    best_acc = 0.0
    train_steps = len(train_loader)

    #json_path = './class_indices.json'
    #json_file = open(json_path, "r")
    #class_indict = json.load(json_file)
    #model = AlexNet(num_classed=6).to(device)

    trainLOSS = []  #save loss
    testLOSS = []  #save loss
    valACC = []  #save val acc

    for epoch in range(epochs):
        # Report the learning rate actually used for this epoch.
        print('LR:{}'.format(optimizer.param_groups[0]['lr']))
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            images, labels = data
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, loss)
        # Step the scheduler after the optimizer updates of the epoch;
        # stepping first skips the first value of the schedule.
        scheduler.step()

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader, colour='green')
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

        val_accurate = acc / val_num
        tbwriter.add_scalar('train/loss', running_loss / train_steps, epoch)
        tbwriter.add_scalar('val/acc', val_accurate, epoch)
        trainLOSS.append(running_loss / train_steps)
        valACC.append(val_accurate)
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))
        print(' ')

        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    #predict
    #weights_path="./AlexNet.pth"
    #model.load_state_dict(torch.load(weights_path))
    #model.eval()
    #with torch.no_grad():
    #    putput = torch.squeeze(model(img.to(device))).cpu()
    #    predict = torch.softmax(output, dim=0)
    #    predict_cla = torch.argmax(predict.numpy)

    npLOSS = np.array(trainLOSS)
    npVALACC = np.array(valACC)
    # np.save does not create missing directories.
    os.makedirs('./save', exist_ok=True)
    np.save('./save/loss_epoch_{}'.format(epoch), npLOSS)
    np.save('./save/valacc_epoch_{}'.format(epoch), npVALACC)
    print('Finished Training')
if cuda: xv, yv = xv.cuda(), yv.cuda() v_feature, v_score, v_pred = model.forward(xv) v_pred_label = torch.max(v_score, 1)[1] v_equal = torch.eq(v_pred_label, yv).float() zeros = zeros.scatter_add(0, yv, v_equal) zeros_classes = zeros_classes.scatter_add( 0, yv, torch.ones_like(yv, dtype=torch.float)) v_correct += torch.sum(v_equal).item() v_sum += len(yv) v_acc = v_correct / v_sum output('validation: {}, {}'.format(v_correct, v_acc, zeros)) output('class: {}'.format(zeros.tolist())) output('class: {}'.format(zeros_classes.tolist())) output('source: {}, target: {}, batch_size: {}, init_lr: {}'.format( s_name, t_name, batch_size, init_lr)) output('lr_mult: {}, lr_mult_D: {}'.format(lr_mult, lr_mult_D)) output(' ======= START TRAINING ======= ') # save model if epoch % 1000 == 0 and epoch != 0: torch.save( { 'epoch': epoch + 1, 'model': model.state_dict(), 'opt': opt.state_dict(), 'opt_D': opt_D.state_dict() }, checkpoint_save_path) epoch += 1
def main():
    """Train AlexNet on the 5-class flower dataset and keep the best weights.

    Writes the index-to-class mapping to ``class_idices.json`` and saves the
    weights with the highest validation accuracy to ``./AlexNet.pth``.
    """
    # Pick the device to run on.
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_gpu else "cpu")
    print("using {} device.".format(device))

    # Preprocessing pipelines for the two splits.
    train_pipeline = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    val_pipeline = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    data_transform = {"train": train_pipeline, "val": val_pipeline}

    # Directory holding the train/ and val/ folders.
    image_path = '/home/xulei/数据集大本营/5_flower_data/flower_data'  # flower data root path
    # Abort if the directory does not exist.
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)

    # Training split and its size.
    train_root = os.path.join(image_path, "train")
    train_dataset = datasets.ImageFolder(root=train_root,
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps name -> index, e.g. {'daisy': 0, 'dandelion': 1, ...};
    # invert it so the JSON file maps index -> name.
    cla_dict = {index: name
                for name, index in train_dataset.class_to_idx.items()}
    # indent=4 puts every key on its own line in the JSON output.
    with open('class_idices.json', 'w') as json_file:
        json_file.write(json.dumps(cla_dict, indent=4))

    batch_size = 128
    worker_cap = batch_size if batch_size > 1 else 0
    nw = min(os.cpu_count(), worker_cap, 8)  # number of dataloader workers
    print("using {} dataloader workers every process".format(nw))

    train_loader = datas.DataLoader(train_dataset,
                                    batch_size,
                                    shuffle=True,
                                    num_workers=nw)

    val_root = os.path.join(image_path, "val")
    validate_dataset = datasets.ImageFolder(root=val_root,
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = datas.DataLoader(validate_dataset,
                                       batch_size,
                                       shuffle=False,
                                       num_workers=nw)
    print("using {} images for trainning, {} images for validation.".format(
        train_num, val_num))

    net = AlexNet(num_classes=5).to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.00004)

    epoches = 20
    save_path = './AlexNet.pth'
    best_acc = 0.0
    # Number of batches per epoch.
    train_steps = len(train_loader)

    for epoch in range(epoches):
        # ---- training pass ----
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)  # progress bar
        for images, labels in train_bar:
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # Accumulate the loss and refresh the progress-bar caption.
            running_loss += loss.item()
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epoches, loss)

        # ---- validation pass ----
        net.eval()
        acc = 0.0  # number of correct predictions in this epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader)  # , colour='green'
            for val_images, val_labels in val_bar:
                outputs = net(val_images.to(device))
                _, predict_y = torch.max(outputs, dim=1)
                matches = predict_y == val_labels.to(device)
                acc += matches.sum().item()

        val_accurate = acc / val_num
        mean_loss = running_loss / train_steps
        print('\n[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, mean_loss, val_accurate))

        # Keep only the best-performing weights.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print("Finshed Training")
# Train for num_epochs, recording the loss of every optimisation step.
train_loss = []
for epoch in range(num_epochs):
    for idx, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Standard update: clear gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
        # Report progress every 100 steps.
        if (idx + 1) % 100 == 0:
            print(f"epoch is {epoch}/{num_epochs} Step is: {idx}/{num_batches} loss is: {loss.item()}")

# Evaluate on the test set with gradients disabled.
model.eval()
with torch.no_grad():
    correct, total = 0, 0
    for idx, (inputs, labels) in enumerate(test_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        preds = model(inputs)
        values, indices = torch.max(preds, 1)  # indices are the predicted classes
        total += labels.shape[0]
        correct += (labels == indices).sum().item()

    print(f"Accuracy of the network is: {100 * correct / total}%")

torch.save(model.state_dict(), 'model.pth')
print() with open(os.path.join("train.log"), "a") as log: log.write(str('%f s' % (time.perf_counter() - time_start)) + "\n") print('%f s' % (time.perf_counter() - time_start)) ########################################### validate ########################################### net.eval() # 验证过程中关闭 Dropout acc = 0.0 with torch.no_grad(): for val_data in validate_loader: val_images, val_labels = val_data outputs = net(val_images.to(device)) predict_y = torch.max(outputs, dim=1)[1] # 以output中值最大位置对应的索引(标签)作为预测输出 acc += (predict_y == val_labels.to(device)).sum().item() val_accurate = acc / val_num # 保存准确率最高的那次网络参数 if val_accurate > best_acc: best_acc = val_accurate torch.save(net.state_dict(), save_path) with open(os.path.join("train.log"), "a") as log: log.write( str('[epoch %d] train_loss: %.3f test_accuracy: %.3f \n' % (epoch + 1, running_loss / step, val_accurate)) + "\n") print('[epoch %d] train_loss: %.3f test_accuracy: %.3f \n' % (epoch + 1, running_loss / step, val_accurate)) with open(os.path.join("train.log"), "a") as log: log.write(str('Finished Training') + "\n") print('Finished Training')
class Solver(object):
    """Train and evaluate AlexNet on CIFAR-10.

    ``config`` must provide ``lr``, ``epoch``, ``trainBatchSize``,
    ``testBatchSize`` and ``cuda``. Call :meth:`run` to load the data, build
    the model and train for ``config.epoch`` epochs.
    """

    def __init__(self, config):
        self.model = None
        self.lr = config.lr
        self.epochs = config.epoch
        self.train_batch_size = config.trainBatchSize
        self.test_batch_size = config.testBatchSize
        self.criterion = None
        self.optimizer = None
        self.scheduler = None
        self.device = None
        self.cuda = config.cuda
        self.train_loader = None
        self.test_loader = None
        self.is_board = False

    def load_data(self):
        """Create the CIFAR-10 train and test data loaders."""
        train_transform = transforms.Compose(
            [transforms.RandomHorizontalFlip(),
             transforms.ToTensor()])
        test_transform = transforms.Compose([transforms.ToTensor()])
        train_set = torchvision.datasets.CIFAR10(
            root='/mnt/disk50/datasets/cifar',
            train=True,
            download=True,
            transform=train_transform)
        self.train_loader = torch.utils.data.DataLoader(
            dataset=train_set, batch_size=self.train_batch_size, shuffle=True)
        test_set = torchvision.datasets.CIFAR10(
            root='/mnt/disk50/datasets/cifar',
            train=False,
            download=True,
            transform=test_transform)
        self.test_loader = torch.utils.data.DataLoader(
            dataset=test_set, batch_size=self.test_batch_size, shuffle=False)

    def load_model_from_pth(self, model_path):
        """Load pre-trained weights into ``self.model``.

        The file must contain a dict whose ``'model'`` entry is a state dict.
        A ``'module.'`` prefix on the parameter names is stripped or added so
        that the checkpoint matches the current model.

        :param model_path: path of the checkpoint file
        :return: ``self.model`` with the weights loaded
        """
        # self.device is the attribute this class defines; it may still be
        # None here, in which case tensors keep their saved location.
        checkpoint = torch.load(model_path,
                                map_location=self.device)['model']

        # TODO: understand exactly why the prefixes can differ (presumably
        # nn.DataParallel wrapping -- confirm).
        prefix = 'module.'
        checkpoint_parameter_name = list(checkpoint.keys())[0]
        model_parameter_name = next(self.model.named_parameters())[0]
        is_checkpoint = checkpoint_parameter_name.startswith(prefix)
        is_model = model_parameter_name.startswith(prefix)

        if is_checkpoint and not is_model:
            # Strip the prefix from the checkpoint's parameter names.
            state = OrderedDict(
                (key[len(prefix):], value)
                for key, value in checkpoint.items()
                if key.startswith(prefix))
        elif not is_checkpoint and is_model:
            # Add the prefix to the checkpoint's parameter names.
            state = OrderedDict(
                (key if key.startswith(prefix) else prefix + key, value)
                for key, value in checkpoint.items())
        else:
            state = checkpoint

        # Every branch must actually load the weights; the prefix-adding
        # branch used to build its dict and then drop it.
        self.model.load_state_dict(state)
        return self.model

    def load_model(self):
        """Create the device, model, optimizer, LR scheduler and loss."""
        if self.cuda:
            self.device = torch.device('cuda:0')
            cudnn.benchmark = True
        else:
            self.device = torch.device('cpu')

        # self.model = LeNet().to(self.device)
        self.model = AlexNet().to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.scheduler = optim.lr_scheduler.MultiStepLR(self.optimizer,
                                                        milestones=[75, 150],
                                                        gamma=0.5)
        self.criterion = nn.CrossEntropyLoss().to(self.device)

    def train(self, writer=None):
        """Run one training epoch.

        :param writer: unused
        :return: tuple ``(summed batch loss, accuracy)`` for the epoch
        """
        print("train:")
        self.model.train()
        train_loss = 0
        train_correct = 0
        total = 0

        for batch_num, (data, target) in enumerate(self.train_loader):
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            # second param "1" represents the dimension to be reduced
            prediction = torch.max(output, 1)
            total += target.size(0)

            # train_correct incremented by one if predicted right
            train_correct += np.sum(
                prediction[1].cpu().numpy() == target.cpu().numpy())

            progress_bar(
                batch_num, len(self.train_loader),
                'Loss: %.4f | Acc: %.3f%% (%d/%d)' %
                (train_loss / (batch_num + 1), 100. * train_correct / total,
                 train_correct, total))
            # if not writer:
            #     writer.add_scalar

        return train_loss, train_correct / total

    def test(self):
        """Evaluate on the test set.

        :return: tuple ``(summed batch loss, accuracy, seconds elapsed)``
        """
        print("test:")
        self.model.eval()
        test_loss = 0
        test_correct = 0
        total = 0

        start = time.time()
        with torch.no_grad():
            for batch_num, (data, target) in enumerate(self.test_loader):
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                loss = self.criterion(output, target)
                test_loss += loss.item()
                prediction = torch.max(output, 1)
                total += target.size(0)
                test_correct += np.sum(
                    prediction[1].cpu().numpy() == target.cpu().numpy())

                progress_bar(
                    batch_num, len(self.test_loader),
                    'Loss: %.4f | Acc: %.3f%% (%d/%d)' %
                    (test_loss / (batch_num + 1), 100. * test_correct / total,
                     test_correct, total))
        end = time.time()
        time_used = end - start

        return test_loss, test_correct / total, time_used

    def save(self):
        """Write the current model weights to ``./best_model_new.pkl``."""
        model_out_path = "./best_model_new.pkl"
        torch.save(self.model.state_dict(), model_out_path)
        print("Checkpoint saved to {}".format(model_out_path))

    def run(self):
        """Load data and model, then train and keep the best weights."""
        self.load_data()
        self.load_model()
        # for k, v in self.model.state_dict():
        #     print('layer{}'.k)
        #     print(v)
        accuracy = 0
        saved = False
        writer = SummaryWriter()
        for epoch in range(1, self.epochs + 1):
            self.scheduler.step(epoch)
            # Show the configured epoch count instead of a hard-coded 200.
            print("\n===> epoch: %d/%d" % (epoch, self.epochs))
            train_loss, train_acc = self.train()
            # test() returns (loss, accuracy, elapsed seconds).
            test_loss, test_acc, _ = self.test()
            # writer.add_scalar('loss_group',{'train_loss':train_loss.numpy(),
            #                                 'test_loss':test_loss.numpy()},epoch)
            # writer.add_scalar('acc_group',{'train_acc':train_acc.numpy(),
            #                                'test_acc':test_acc.numpy()}, epoch)
            if test_acc > accuracy:
                accuracy = test_acc
                self.save()
                saved = True

        print("===> BEST ACC. PERFORMANCE: %.3f%%" % (accuracy * 100))
        # Only write the final weights when no epoch ever improved on the
        # initial accuracy; otherwise the best checkpoint would be replaced
        # by a worse model.
        if self.epochs >= 1 and not saved:
            self.save()
def main():
    """Train AlexNet on the flower dataset and keep the best weights.

    The dataset is read from ``<cwd>/data_set/flower_data`` (``train`` and
    ``val`` sub-folders). The index-to-class mapping is written to
    ``class_indices.json`` and the weights with the highest validation
    accuracy are saved to ``weights/alexnet.pth``.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    batch_size = 16
    epochs = 20

    data_transform = {
        "train":
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "."))  # get data root path
    image_path = os.path.join(data_root, "data_set", "flower_data")  # flower data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(
        image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    # create model
    net = AlexNet(num_classes=5)
    net.to(device)

    # define loss function
    loss_function = nn.CrossEntropyLoss()

    # construct an optimizer
    params = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=0.0001)

    best_acc = 0.0
    save_path = 'weights/alexnet.pth'
    # torch.save does not create missing directories; make sure the target
    # folder exists before the first checkpoint is written.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    train_steps = len(train_loader)
    for epoch in range(epochs):
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            images, labels = data
            optimizer.zero_grad()
            logits = net(images.to(device))
            loss = loss_function(logits, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, loss)

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader)
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                # loss = loss_function(outputs, test_labels)
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

                val_bar.desc = "valid epoch[{}/{}]".format(epoch + 1, epochs)

        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')
if __name__ == "__main__": # init seed value seed = torch.initial_seed() # TensorboardX tbwriter = SummaryWriter(log_dir=LOG_DIR) print("TensorboardX summary writer created") # create model alexnet = AlexNet(num_classes=NUM_CLASSES) # load pretrained model if pretrained: alexnet_dict = alexnet.state_dict() # print(alexnet_dict.keys()) alexnet_pretrained = models.alexnet(pretrained=True) pretrained_dict = alexnet_pretrained.state_dict() pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in alexnet_dict} # print(pretrained_dict.keys()) pretrained_dict.pop("classifier.6.weight") pretrained_dict.pop("classifier.6.bias") alexnet_dict.update(pretrained_dict) alexnet.load_state_dict(alexnet_dict) # print(alexnet_dict.keys()) print("Load from pretrained") # Freeze parameter if freeze_layer: for name, value in alexnet.named_parameters():
def train():
    """Train AlexNet on ``train_dataloader`` and write its weights to ``MODEL_PATH``.

    Reads its configuration from the module-level ``opt`` object
    (``checkpoints_dir``, ``num_classes``, ``lr``, ``epochs``) and uses the
    module-level ``device``, ``train_dataloader``, ``AverageMeter`` and
    ``accuracy`` helpers.  If ``MODEL_PATH`` already exists, training resumes
    from the weights stored there.
    """
    # Best-effort creation of the checkpoint directory: any OSError
    # (for example "already exists") is deliberately ignored.
    try:
        os.makedirs(opt.checkpoints_dir)
    except OSError:
        pass

    # Wrap the network in DataParallel only when several GPUs are visible.
    multi_gpu = torch.cuda.device_count() > 1
    model = AlexNet(num_classes=opt.num_classes)
    if multi_gpu:
        model = torch.nn.parallel.DataParallel(model)

    # Resume from an existing checkpoint; the map_location callback keeps
    # every loaded storage where it was deserialized instead of moving it
    # back to the device it was saved from.
    if os.path.exists(MODEL_PATH):
        saved_weights = torch.load(
            MODEL_PATH, map_location=lambda storage, loc: storage)
        model.load_state_dict(saved_weights)
    model.to(device)

    # Cross-entropy loss optimized with Adam.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    for epoch in range(opt.epochs):
        print(f"\nBegin Training Epoch {epoch + 1}")

        # Fresh running statistics for every epoch.
        losses, top1, top5 = AverageMeter(), AverageMeter(), AverageMeter()

        for i, (inputs, targets) in enumerate(train_dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            n_samples = inputs.size(0)

            # Forward pass.
            output = model(inputs)
            loss = criterion(output, targets)

            # Record loss and precision.  NOTE: the meter called ``top5`` is
            # fed the second value of topk=(1, 2), i.e. top-2 precision.
            prec1, prec5 = accuracy(output, targets, topk=(1, 2))
            losses.update(loss.item(), n_samples)
            top1.update(prec1, n_samples)
            top5.update(prec5, n_samples)

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # "\r" rewrites the same console line on every batch.
            print(
                f"Epoch [{epoch + 1}] [{i + 1}/{len(train_dataloader)}]\t"
                f"Loss {loss.item():.4f}\t"
                f"Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t"
                f"Prec@5 {top5.val:.3f} ({top5.avg:.3f})",
                end="\r")

        # NOTE(review): the original indentation was lost; the checkpoint is
        # assumed to be written once per epoch - confirm against the source.
        torch.save(model.state_dict(), MODEL_PATH)
def main():
    """Train AlexNet on the ``flower_data`` image folders and keep the best weights.

    Expects ``flower_data/train`` and ``flower_data/val`` below the current
    working directory, laid out for ``datasets.ImageFolder``.  Side effects:

    * writes the index -> class-name mapping to ``class_indices.json``;
    * saves the state dict to ``./AlexNet.pth`` whenever the validation
      accuracy improves on the best value seen so far;
    * prints a progress bar, the wall-clock time and a summary per epoch.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]),
        "val": transforms.Compose([
            transforms.Resize((224, 224)),  # cannot 224, must (224, 224)
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]),
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "./"))
    image_path = os.path.join(data_root, "flower_data")

    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx
    # Invert the mapping so that a predicted index can be turned into a name.
    cla_dict = {val: key for key, val in flower_list.items()}
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 8
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    # Accuracy does not depend on sample order, so validation is not shuffled.
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4,
                                                  shuffle=False,
                                                  num_workers=0)

    # Optional visual sanity check of one validation batch
    # (use shuffle=True above to see a random sample):
    # test_data_iter = iter(validate_loader)
    # test_image, test_label = test_data_iter.next()
    #
    # def imshow(img):
    #     img = img / 2 + 0.5  # unnormalize
    #     npimg = img.numpy()
    #     plt.imshow(np.transpose(npimg, (1, 2, 0)))
    #     plt.show()
    #
    # print(' '.join('%5s' % cla_dict[test_label[j].item()] for j in range(4)))
    # imshow(utils.make_grid(test_image))

    net = AlexNet(num_class=5)
    print(net)
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    save_path = './AlexNet.pth'
    best_acc = 0.0
    # Number of batches per epoch; used for the progress bar and as the
    # divisor of the mean training loss.
    train_steps = len(train_loader)

    for epoch in range(10):
        # train
        net.train()
        running_loss = 0.0
        t1 = time.perf_counter()
        for step, data in enumerate(train_loader, start=0):
            images, labels = data
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            batch_loss = loss.item()
            running_loss += batch_loss

            # print train process: a 50-character bar redrawn in place via "\r"
            rate = (step + 1) / train_steps
            a = "*" * int(rate * 50)
            b = "." * int((1 - rate) * 50)
            print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(
                int(rate * 100), a, b, batch_loss), end="")
        print()
        print(time.perf_counter() - t1)

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        # No gradients are tracked during validation.
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                predict_y = torch.max(outputs, dim=1)[1]
                acc += (predict_y == val_labels.to(device)).sum().item()
            val_accurate = acc / val_num
            if val_accurate > best_acc:
                best_acc = val_accurate
                torch.save(net.state_dict(), save_path)
            # Mean loss is taken over the number of batches.  The previous
            # divisor was the last zero-based step index, i.e. one too small
            # (and zero for a single-batch epoch).
            print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
                  (epoch + 1, running_loss / train_steps, val_accurate))

    print('Finished Training')
def main():
    """Train AlexNet on the flower dataset and keep the best-scoring weights.

    The data set is expected at ``<cwd>/../../data_set/flower_data`` with
    ``train`` and ``val`` sub-folders laid out for ``datasets.ImageFolder``.
    Side effects:

    * writes the index -> class-name mapping to ``class_indices.json``;
    * saves the state dict to ``./AlexNet.pth`` whenever the validation
      accuracy improves on the best value seen so far;
    * prints progress bars and a one-line summary per epoch.

    Raises:
        AssertionError: if the data set directory does not exist.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]),
        "val": transforms.Compose([
            transforms.Resize((224, 224)),  # cannot 224, must (224, 224)
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]),
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    image_path = os.path.join(data_root, "data_set", "flower_data")  # flower data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(
        image_path)

    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx
    # Invert the mapping so that a predicted index can be turned into a name.
    cla_dict = {val: key for key, val in flower_list.items()}
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    # number of workers: capped by CPU count, batch size and 8.
    # os.cpu_count() may return None when the count cannot be determined,
    # which would make min() raise TypeError, so fall back to 1.
    cpu_total = os.cpu_count() or 1
    nw = min([cpu_total, batch_size if batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=nw)

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=4,
                                                  shuffle=False,
                                                  num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    # Optional visual sanity check of one validation batch:
    # test_data_iter = iter(validate_loader)
    # test_image, test_label = test_data_iter.next()
    #
    # def imshow(img):
    #     img = img / 2 + 0.5  # unnormalize
    #     npimg = img.numpy()
    #     plt.imshow(np.transpose(npimg, (1, 2, 0)))
    #     plt.show()
    #
    # print(' '.join('%5s' % cla_dict[test_label[j].item()] for j in range(4)))
    # imshow(utils.make_grid(test_image))

    net = AlexNet(num_classes=5, init_weights=True)
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    # pata = list(net.parameters())
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    epochs = 10
    save_path = './AlexNet.pth'
    best_acc = 0.0
    train_steps = len(train_loader)  # batches per epoch, divisor of the mean loss

    for epoch in range(epochs):
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            images, labels = data
            optimizer.zero_grad()
            outputs = net(images.to(device))
            loss = loss_function(outputs, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics; the scalar is extracted once and reused
            batch_loss = loss.item()
            running_loss += batch_loss
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, batch_loss)

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            val_bar = tqdm(validate_loader)
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                # Index of the largest score along dim 1 is the predicted class.
                predict_y = torch.max(outputs, dim=1)[1]
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        # Keep only the weights with the best validation accuracy so far.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')
def main():
    """Train AlexNet on the flower dataset and keep the best-scoring weights.

    The data set is expected at ``<cwd>/data_set/flower_data`` with ``train``
    and ``val`` sub-folders laid out for ``datasets.ImageFolder``.
    Side effects:

    * prints the current working directory;
    * writes the index -> class-name mapping to ``class_indices.json``;
    * saves the state dict to ``./AlexNet.pth`` whenever the validation
      accuracy improves on the best value seen so far;
    * prints progress bars and a one-line summary per epoch.
    """
    # torch.device selects the device used for training.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    # Pre-processing pipelines for the two splits.
    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),   # random crop resized to 224x224
            transforms.RandomHorizontalFlip(),   # random horizontal flip
            transforms.ToTensor(),               # convert to tensor
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),  # normalize
        ]),
        "val": transforms.Compose([
            transforms.Resize((224, 224)),       # cannot 224, must (224, 224)
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]),
    }

    print(os.getcwd())
    # data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    # os.getcwd() returns the working directory of the current process, which
    # is not necessarily the directory containing this file.  "../.." would
    # point two levels up; which form is right depends on where the script is
    # launched from.  os.path.abspath() turns the result into an absolute path.
    data_root = os.path.abspath(os.getcwd())
    # os.path.join inserts the path separators between the components.
    image_path = os.path.join(data_root, "data_set", "flower_data")  # flower data set path
    # assert os.path.exists(image_path), "{} path does not exist.".format(image_path)

    train_dataset = datasets.ImageFolder(
        root=os.path.join(image_path, "train"),   # training split
        transform=data_transform["train"])        # with the "train" pipeline
    train_num = len(train_dataset)                # number of training images

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx      # class name -> index
    # Swap keys and values to obtain index -> class name.
    cla_dict = {val: key for key, val in flower_list.items()}
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    batch_size = 32
    # number of workers: capped by CPU count, batch size and 8.
    # os.cpu_count() may return None when the count cannot be determined,
    # which would make min() raise TypeError, so fall back to 1.
    cpu_total = os.cpu_count() or 1
    nw = min([cpu_total, batch_size if batch_size > 1 else 0, 8])
    print('Using {} dataloader workers every process'.format(nw))

    # Batches of `batch_size` samples, reshuffled every epoch.
    # NOTE(review): the original comment advises num_workers=0 on Windows.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=nw)

    validate_dataset = datasets.ImageFolder(
        root=os.path.join(image_path, "val"),
        transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(
        validate_dataset,
        batch_size=batch_size,
        shuffle=False,  # batch_size=4, shuffle=True,
        num_workers=nw)

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    # Demo for inspecting the data set.  Switch the validation loader above
    # to batch_size=4, shuffle=True before using it:
    # test_data_iter = iter(validate_loader)
    # test_image, test_label = test_data_iter.next()
    # def imshow(img):
    #     img = img / 2 + 0.5  # unnormalize
    #     npimg = img.numpy()
    #     plt.imshow(np.transpose(npimg, (1, 2, 0)))
    #     plt.show()
    # print(' '.join('%5s' % cla_dict[test_label[j].item()] for j in range(4)))
    # imshow(utils.make_grid(test_image))

    # Five flower classes, with weight initialization enabled.
    net = AlexNet(num_classes=5, init_weights=True)
    net.to(device)  # move the network to the selected device

    loss_function = nn.CrossEntropyLoss()  # multi-class cross-entropy loss
    # pata = list(net.parameters())  # debugging aid: inspect the parameters
    # Adam over all network parameters with learning rate 0.0002.
    optimizer = optim.Adam(net.parameters(), lr=0.0002)

    epochs = 10
    save_path = './AlexNet.pth'      # where the best weights are written
    best_acc = 0.0                   # best validation accuracy seen so far
    train_steps = len(train_loader)  # batches per epoch, divisor of the mean loss

    for epoch in range(epochs):
        # train
        # net.train() switches to training mode; per the original comment the
        # model uses dropout, which should be active only while training.
        net.train()
        running_loss = 0.0              # summed batch losses of this epoch
        train_bar = tqdm(train_loader)  # progress bar over the batches
        for step, data in enumerate(train_bar):
            images, labels = data       # each batch is (images, labels)
            optimizer.zero_grad()       # clear gradients of the previous step
            outputs = net(images.to(device))                  # forward pass
            loss = loss_function(outputs, labels.to(device))  # labels on the same device
            loss.backward()             # back-propagate
            optimizer.step()            # update the parameters

            # print statistics; the scalar is extracted once and reused
            batch_loss = loss.item()
            running_loss += batch_loss
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(
                epoch + 1, epochs, batch_loss)

        # validate
        net.eval()  # evaluation mode (dropout off)
        acc = 0.0   # accumulate accurate number / epoch
        with torch.no_grad():  # no gradient tracking during validation
            val_bar = tqdm(validate_loader)
            for val_data in val_bar:
                val_images, val_labels = val_data
                # presumably shaped [batch, num_classes] (5 classes here) - confirm
                outputs = net(val_images.to(device))
                # torch.max(..., dim=1)[1] is the index of the largest score
                # per sample, i.e. the predicted class; the score is ignored.
                predict_y = torch.max(outputs, dim=1)[1]
                # Count correct predictions; .item() yields a Python number.
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()
                # acc += (predict_y == val_labels.to(device)).sum().item()  # equivalent

        val_accurate = acc / val_num  # correct predictions / validation samples
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        # Save only when the accuracy beats the best one so far.
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')