def train_epoch(self):
    current_loss = 0
    train_accuracy = 0
    epoch_start = time.time()
    for step, sample_batched in enumerate(self.train_dataloader):
        with torch.set_grad_enabled(True):
            output = self.model(sample_batched)
            loss = self.loss(output, self.get_targets(sample_batched["label"]))
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        current_loss += loss.detach()
        train_accuracy += get_acc(output, self.get_targets(sample_batched["label"]))
        if step % self.train_log_frequency == 0 and step > 0:
            # average over the samples seen since the last log
            current_loss /= self.train_log_frequency * self.train_batch_size
            train_accuracy /= self.train_log_frequency * self.train_batch_size
            self.writer.add_scalar(
                "Train/loss",
                current_loss,
                (self.epoch - 1) * len(self.train_dataloader) + step,
            )
            self.writer.add_scalar(
                "Train/accuracy",
                train_accuracy,
                (self.epoch - 1) * len(self.train_dataloader) + step,
            )
            print(
                f"> epoch: {self.epoch} | step: {step} | loss: {current_loss} | "
                f"train_accuracy: {train_accuracy} | "
                f"epoch_training_time: {time.time() - epoch_start} s"
            )
            current_loss = 0
            train_accuracy = 0
    print(f"=> Train epoch {self.epoch} finished in {time.time() - epoch_start} s")
def validate(self):
    validation_start = time.time()
    validation_accuracy = 0
    self.model.eval()  # set model to evaluate mode
    # Iterate over data.
    for step, sample_batched in enumerate(self.dev_dataloader):
        with torch.set_grad_enabled(False):
            output = self.model(sample_batched)
            validation_accuracy += get_acc(
                output, self.get_targets_dev(sample_batched["label"]))
    validation_accuracy /= len(self.dev_dataloader) * self.dev_batch_size
    self.writer.add_scalar("Validation/accuracy", validation_accuracy, self.epoch)
    print(
        f"=> Validation epoch {self.epoch} | validation_accuracy: {validation_accuracy} | "
        f"validation_time: {time.time() - validation_start} s"
    )
    if validation_accuracy > self.best_validation_accuracy:
        torch.save(self.model, f"{self.save_dir}/{self.epoch}_siamese.pth")
        self.best_validation_accuracy = validation_accuracy
        print("====> New best model saved!")
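
# Both methods above call a get_acc helper that is not defined in this file.
# A minimal sketch, assuming it returns the *number* of correct predictions in
# the batch -- that matches the normalization by
# train_log_frequency * train_batch_size (and dev_batch_size) used above. The
# standalone scripts below average per batch instead, which implies a
# fraction-returning variant (sketched after the model-choice script).
def get_acc(output, target):
    # count of correct top-1 predictions in the batch (assumed semantics)
    pred = output.argmax(dim=1)
    return (pred == target).sum().item()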
def main(mode=None):
    global name, logger
    # Tag_ResidualBlocks_BatchSize
    name = "my_log"
    logger = SummaryWriter("runs/" + name)
    cat_dir = "D:/Codewyf/AI/data/datasets/test/cat_test/"
    dog_dir = "D:/Codewyf/AI/data/datasets/test/dog_test/"
    config = load_config(mode)
    torch.manual_seed(config.SEED)  # seed the CPU RNG so results are reproducible
    torch.cuda.manual_seed(config.SEED)  # seed the GPU RNG so every initialization is identical
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)  # training data path
    length_train = len(train_set)  # number of items in the training set
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)  # batches per epoch (needed by WarmUpLR below)

    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    length_test = len(test_set)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=True)

    # fixed: these loaders previously wrapped test_set instead of the
    # cat/dog-specific sets
    cat_test_set = ImageFolder(cat_dir, transform=test_tf)
    cat_test_data = torch.utils.data.DataLoader(cat_test_set,
                                                batch_size=config.BATCH_SIZE,
                                                shuffle=True)
    dog_test_set = ImageFolder(dog_dir, transform=test_tf)
    dog_test_data = torch.utils.data.DataLoader(dog_test_set,
                                                batch_size=config.BATCH_SIZE,
                                                shuffle=True)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose network
    net = resnet18().to(config.DEVICE)
    print('The Model is ResNet18\n')

    # optimizer and loss function
    optimizer = optim.SGD(net.parameters(),
                          lr=config.LR,
                          momentum=0.9,
                          weight_decay=5e-4)  # stochastic gradient descent
    loss_function = nn.CrossEntropyLoss()  # cross-entropy loss

    # warmup
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=0.5)  # learning-rate schedule
    # MILESTONES is a list and gamma the decay factor: starting from LR = 0.01,
    # the LR is multiplied by gamma each time the epoch reaches a milestone
    # (e.g. 3, 6, 9).
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)

    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH, 'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path, '{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, config.EPOCH):
        if epoch > config.WARM:
            train_scheduler.step(epoch)

        ### train ###
        net.train()  # switch to training mode
        train_loss = 0.0  # cost function error
        train_correct = 0.0
        for i, data in enumerate(train_data):
            steps = len(train_data) * (epoch - 1) + i  # global step count
            if epoch <= config.WARM:
                warmup_scheduler.step()
            image, label = data
            image, label = image.to(config.DEVICE), label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()  # zero the gradients, i.e. reset d(loss)/d(weight)
            loss.backward()
            optimizer.step()

            # log the loss every 2 steps
            if i % 2 == 0:
                train_loss_log = train_loss / (i + 1)
                train_correct_log = train_correct / (i + 1)
                logger.add_scalar('train_loss', train_loss_log, steps)
                logger.add_scalar('train_acc', train_correct_log, steps)
                print(
                    'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'
                    .format(train_loss / (i + 1),
                            train_correct / (i + 1),
                            optimizer.param_groups[0]['lr'],
                            epoch=epoch,
                            trained_samples=i * config.BATCH_SIZE + len(image),
                            total_samples=length_train))

        ### eval ###
        net.eval()  # switch to eval mode
        test_loss = 0.0  # cost function error
        test_correct = 0.0
        for i, data in enumerate(test_data):  # accuracy for the epoch just trained
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
            print(
                'Testing: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'
                .format(test_loss / (i + 1),
                        test_correct / (i + 1),
                        test_samples=i * config.BATCH_SIZE + len(images),
                        total_samples=length_test))
        logger.add_scalar('test_loss', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc', test_correct / (i + 1), epoch)
        acc = test_correct / (i + 1)

        # eval on the cat-only test set
        net.eval()
        test_loss = 0.0
        test_correct = 0.0
        for i, data in enumerate(cat_test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
        logger.add_scalar('test_loss_cat', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc_cat', test_correct / (i + 1), epoch)

        # eval on the dog-only test set
        net.eval()
        test_loss = 0.0
        test_correct = 0.0
        for i, data in enumerate(dog_test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)  # fixed typo: was `ouputs`
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
        logger.add_scalar('test_loss_dog', test_loss / (i + 1), epoch)
        logger.add_scalar('test_acc_dog', test_correct / (i + 1), epoch)
        print()

        # save the best-performing model (this block must run after the test
        # accuracy has been computed)
        if epoch > config.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='best'))
            best_acc = acc
            continue
        if not epoch % config.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='regular'))
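
# Every script in this file builds a WarmUpLR scheduler that is not defined
# here. A minimal sketch follows, modeled on the common pytorch-cifar100-style
# implementation: the learning rate grows linearly from ~0 to the base LR over
# the first total_iters optimizer steps (iter_per_epoch * config.WARM above).
# Treat this as an assumption, not necessarily the authors' exact code.
from torch.optim.lr_scheduler import _LRScheduler

class WarmUpLR(_LRScheduler):
    """Linearly warm up the learning rate for the first total_iters steps."""

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # last_epoch counts calls to step(); 1e-8 avoids division by zero
        return [base_lr * self.last_epoch / (self.total_iters + 1e-8)
                for base_lr in self.base_lrs]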
def main(mode=None):
    config = load_config(mode)
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)
    length1 = len(train_set)
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)
    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=False)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose network
    if config.MODEL == 1:
        net = VGG16().to(config.DEVICE)
        print('The Model is VGG\n')
    if config.MODEL == 2:
        net = resnet34().to(config.DEVICE)
        print('The Model is ResNet34\n')
    if config.MODEL == 3:
        net = mobilenet().to(config.DEVICE)
        print('The Model is mobilenet\n')
    if config.MODEL == 4:
        net = shufflenet().to(config.DEVICE)
        print('The Model is shufflenet\n')

    # print(dir(net))
    # # choose train or test
    # if config.MODE == 1:
    #     print("Start Training...\n")
    #     net.train()
    # if config.MODE == 2:
    #     print("Start Testing...\n")
    #     net.test()

    optimizer = optim.SGD(net.parameters(), lr=config.LR, momentum=0.9, weight_decay=5e-4)
    loss_function = nn.CrossEntropyLoss()
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=0.2)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)
    # optimizer = optim.Adam(net.parameters(), lr=float(config.LR), betas=(config.BETA1, config.BETA2))

    # use tensorboard
    runs_path = os.path.join(config.PATH, 'runs')
    if not os.path.exists(runs_path):
        os.mkdir(runs_path)
    # writer = SummaryWriter(log_dir=runs_path)
    # input_tensor = torch.Tensor(12, 3, 32, 32).cuda()
    # writer.add_graph(net, Variable(input_tensor, requires_grad=True))

    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH, 'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path, '{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, 100):
        if epoch > config.WARM:
            train_scheduler.step(epoch)

        ### train ###
        net.train()
        train_loss = 0.0  # cost function error
        train_correct = 0.0
        for i, data in enumerate(train_data):
            if epoch <= config.WARM:
                warmup_scheduler.step()
            image, label = data
            image, label = image.to(config.DEVICE), label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(
                'Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'
                .format(train_loss / (i + 1),
                        train_correct / (i + 1),
                        optimizer.param_groups[0]['lr'],
                        epoch=epoch,
                        trained_samples=i * config.BATCH_SIZE + len(image),
                        total_samples=len(train_data.dataset)))

        ### eval ###
        net.eval()
        test_loss = 0.0  # cost function error
        test_correct = 0.0
        for i, data in enumerate(test_data):
            images, labels = data
            images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
            outputs = net(images)
            loss = loss_function(outputs, labels)
            test_loss += loss.item()
            test_correct += get_acc(outputs, labels)
            print(
                'Test set: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'
                .format(test_loss / (i + 1),
                        test_correct / (i + 1),
                        test_samples=i * config.BATCH_SIZE + len(images),
                        total_samples=len(test_data.dataset)))
        print()

        acc = test_correct / (i + 1)
        # start to save best performance model after the learning rate has decayed to 0.01
        if epoch > config.MILESTONES[1] and best_acc < acc:
            torch.save(net.state_dict(), checkpoint_path.format(epoch=epoch, type='best'))
            best_acc = acc
            continue
        if not epoch % config.SAVE_EPOCH:
            torch.save(net.state_dict(), checkpoint_path.format(epoch=epoch, type='regular'))
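
# The training scripts above accumulate get_acc once per batch and divide by
# the number of batches (i + 1), which implies that here get_acc returns the
# *fraction* of correct predictions in a batch. A minimal sketch under that
# assumption:
def get_acc(output, label):
    # fraction of correct top-1 predictions in the batch
    total = output.shape[0]
    _, pred_label = output.max(1)
    num_correct = (pred_label == label).sum().item()
    return num_correct / total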
                  callbacks=[lr_decay, cb1],
                  verbose=1)
    else:
        datagen.fit(X_train)
        model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size),
                            steps_per_epoch=len(X_train) / batch_size,
                            epochs=epoch,
                            validation_data=(X_test, y_test),
                            callbacks=[lr_decay, cb1],
                            verbose=1)

    # make confusion matrix
    preds_train = np.argmax(model.predict(X_train), axis=1)
    preds_test = np.argmax(model.predict(X_test), axis=1)
    acc_train = get_acc(y=np.argmax(y_train, axis=1), preds=preds_train)
    acc_test = get_acc(y=np.argmax(y_test, axis=1), preds=preds_test)
    save_f_train = 'data/{}/conf_mat/train_{}.png'.format(args.dataset, args.model)
    save_f_test = 'data/{}/conf_mat/test_{}.png'.format(args.dataset, args.model)
    make_confusion_matrix(y_row=np.argmax(y_train, axis=1),
                          y_col=preds_train,
                          save_file_name=save_f_train,
                          dataset=args.dataset,
                          title='acc_train : {:.3f}'.format(acc_train))
    make_confusion_matrix(y_row=np.argmax(y_test, axis=1),
                          y_col=preds_test,
                          save_file_name=save_f_test,
                          dataset=args.dataset,
                          title='acc_test : {:.3f}'.format(acc_test))
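
# make_confusion_matrix is called above but not defined in this snippet. Below
# is a minimal sketch assuming the keyword arguments used above (y_row = true
# labels, y_col = predictions) and an sklearn/matplotlib backend; the original
# implementation may differ.
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def make_confusion_matrix(y_row, y_col, save_file_name, dataset, title):
    # `dataset` is accepted for signature compatibility but unused in this sketch
    cm = confusion_matrix(y_row, y_col)  # rows: true labels, columns: predictions
    fig, ax = plt.subplots()
    ax.imshow(cm, cmap='Blues')
    ax.set_title(title)
    ax.set_xlabel('predicted label')
    ax.set_ylabel('true label')
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            ax.text(c, r, str(cm[r, c]), ha='center', va='center')
    fig.savefig(save_file_name)
    plt.close(fig)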
def main(mode=None):
    time_now = datetime.now().isoformat()
    config = load_config(mode)

    # random seeds
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    # load the training and test sets
    train_set = ImageFolder(config.TRAIN_PATH, transform=train_tf)
    length_train = len(train_set)
    train_data = torch.utils.data.DataLoader(train_set,
                                             batch_size=config.BATCH_SIZE,
                                             shuffle=True)
    iter_per_epoch = len(train_data)
    test_set = ImageFolder(config.TEST_PATH, transform=test_tf)
    length_test = len(test_set)
    test_data = torch.utils.data.DataLoader(test_set,
                                            batch_size=config.BATCH_SIZE,
                                            shuffle=True)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # choose network
    net = VGG16().to(config.DEVICE)
    print('The Model is VGG16\n')

    # use tensorboardX
    if not os.path.exists(config.LOG_DIR):
        os.mkdir(config.LOG_DIR)
    writer = SummaryWriter(log_dir=os.path.join(config.LOG_DIR, time_now))

    # optimizer and loss function
    optimizer = optim.SGD(net.parameters(), lr=config.LR, momentum=0.9, weight_decay=5e-4)
    loss_function = nn.CrossEntropyLoss()

    # warmup
    train_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=config.MILESTONES, gamma=0.5)
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * config.WARM)

    # create checkpoint folder to save model
    model_path = os.path.join(config.PATH, 'model')
    if not os.path.exists(model_path):
        os.mkdir(model_path)
    checkpoint_path = os.path.join(model_path, '{epoch}-{type}.pth')

    best_acc = 0.0
    for epoch in range(1, config.EPOCH):
        if epoch > config.WARM:
            train_scheduler.step(epoch)

        ### train ###
        net.train()
        train_loss = 0.0  # cost function error
        train_correct = 0.0
        for i, data in enumerate(train_data):
            if epoch <= config.WARM:
                warmup_scheduler.step()
            image, label = data
            image, label = image.to(config.DEVICE), label.to(config.DEVICE)

            output = net(image)
            train_correct += get_acc(output, label)
            loss = loss_function(output, label)
            train_loss += loss.item()

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # log the gradient norms of the last layer
            last_layer = list(net.children())[-1]
            n_iter = (epoch - 1) * iter_per_epoch + i + 1
            for name, para in last_layer.named_parameters():
                if 'weight' in name:
                    writer.add_scalar('LastLayerGradients/grad_norm2_weights',
                                      para.grad.norm(), n_iter)
                if 'bias' in name:
                    writer.add_scalar('LastLayerGradients/grad_norm2_bias',
                                      para.grad.norm(), n_iter)

            print('Training Epoch: {epoch} [{trained_samples}/{total_samples}]\tLoss: {:0.4f}\tAcc: {:0.4f}\tLR: {:0.6f}'.format(
                train_loss / (i + 1),
                train_correct / (i + 1),
                optimizer.param_groups[0]['lr'],
                epoch=epoch,
                trained_samples=i * config.BATCH_SIZE + len(image),
                total_samples=length_train))

            writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'], n_iter)
            writer.add_scalar('Train/loss', train_loss / (i + 1), n_iter)
            writer.add_scalar('Train/acc', train_correct / (i + 1), n_iter)

        ### eval ###
        # per-epoch evaluation is disabled in this version; the `if` guard is
        # commented out with its body so the loop stays syntactically valid
        # if epoch % 1 == 0:
        #     net.eval()
        #     test_loss = 0.0
        #     test_correct = 0.0
        #     for i, data in enumerate(test_data):
        #         images, labels = data
        #         images, labels = images.to(config.DEVICE), labels.to(config.DEVICE)
        #         outputs = net(images)
        #         loss = loss_function(outputs, labels)
        #         test_loss += loss.item()
        #         test_correct += get_acc(outputs, labels)
        #         print('Testing: [{test_samples}/{total_samples}]\tAverage loss: {:.4f}, Accuracy: {:.4f}'.format(
        #             test_loss / (i + 1),
        #             test_correct / (i + 1),
        #             test_samples=i * config.BATCH_SIZE + len(images),
        #             total_samples=length_test))
        #     writer.add_scalar('Test/Average loss', test_loss / (i + 1), n_iter)
        #     writer.add_scalar('Test/Accuracy', test_correct / (i + 1), n_iter)
        #     print()

        # start to save best performance model
        # acc = test_correct / (i + 1)
        # if epoch > config.MILESTONES[1] and best_acc < acc:
        #     torch.save(net.state_dict(), checkpoint_path.format(epoch=epoch, type='best'))
        #     best_acc = acc
        #     continue
        if not epoch % config.SAVE_EPOCH:
            torch.save(net.state_dict(),
                       checkpoint_path.format(epoch=epoch, type='regular'))
    writer.close()
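
# Usage sketch (not part of the original source): restoring one of the
# '{epoch}-regular.pth' checkpoints written by the loop above. Assumes
# checkpoint_path from main() is in scope; the epoch number 10 is a
# placeholder.
net = VGG16()
state = torch.load(checkpoint_path.format(epoch=10, type='regular'),
                   map_location='cpu')
net.load_state_dict(state)
net.eval()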
def do_train(
    cfg,
    model,
    metric_fc,
    train_loader,
    val_loader,
    optimizer,
    lr_schedule,
    loss_fn,
    loss_fn2,
    logger,
):
    output_dir = cfg.OUTPUT_DIR
    device = cfg.MODEL.DEVICE
    epochs = cfg.SOLVER.MAX_EPOCHS
    lfw_test_list = cfg.LFW_TEST_LIST
    # device_ids = [0, 1, 2, 3]
    # model = torch.nn.DataParallel(model, device_ids=device_ids)
    model.to(device)
    if metric_fc is not None:
        metric_fc.to(device)
    map_dict = read_pkl()

    for epoch in range(epochs):
        lr_schedule.step()
        model.train()

        # zero the losses
        train_loss = 0
        train_loss1 = 0
        train_loss2 = 0
        train_acc = 0
        label_mse_tensor = torch.tensor([])
        for iter, (images, targets) in enumerate(tqdm(train_loader)):
            iter += 1
            images = images.to(device)
            targets = targets.to(device)

            # read the PEDCC weights: stack the predefined centroid of each
            # target class into a (batch, 512) tensor
            tensor_empty = torch.Tensor([]).to(device)
            for target_index in targets:
                tensor_empty = torch.cat(
                    (tensor_empty, map_dict[target_index.item()].float().to(device)), 0)
            label_mse_tensor = tensor_empty.view(-1, 512)
            label_mse_tensor = label_mse_tensor.to(device)  # PEDCC of each class

            # forward
            output_ = model(images)
            output = output_[0]
            t_loss1 = loss_fn(output, targets)  # PEDCC-AMSoftmax
            t_loss2 = loss_fn2(output_[1], label_mse_tensor)
            t_loss2 = t_loss2 ** cfg.METRIC.N
            t_loss = t_loss1 + t_loss2

            # backward
            optimizer.zero_grad()
            t_loss.backward()
            optimizer.step()

            train_loss += t_loss.item()
            train_loss1 += t_loss1.item()
            train_loss2 += t_loss2.item()  # track loss1 and loss2 separately during training
            train_acc += get_acc(output_[0], targets)

        valid_loss = 0
        valid_acc = 0
        if val_loader is not None and cfg.DATASETS.NAME == "CIFAR100":
            model = model.eval()
            with torch.no_grad():
                for images, targets in val_loader:
                    images = images.to(device)
                    targets = targets.to(device)
                    output_ = model(images)
                    # v_loss = metric_fc(feature, targets)
                    v_loss = loss_fn(output_[0], targets)  # only the AMSoftmax loss is considered here
                    valid_loss += v_loss.item()
                    valid_acc += get_acc(output_[0], targets)

            avg_t_loss = train_loss / len(train_loader)
            avg_t1_loss = train_loss1 / len(train_loader)
            avg_t2_loss = train_loss2 / len(train_loader)
            avg_v_loss = valid_loss / len(val_loader)
            avg_train_acc = train_acc / len(train_loader)
            avg_val_acc = valid_acc / len(val_loader)
            lr = lr_schedule.get_lr()[0]
            epoch_str = f"Epoch {epoch}: Train Loss1: {avg_t1_loss}, Train Loss2: {avg_t2_loss}, " \
                        f"Train Loss: {avg_t_loss}, Train Acc: {avg_train_acc}, " \
                        f"Valid Loss: {avg_v_loss}, Valid Acc: {avg_val_acc}, LR: {lr}"
            logger.info(epoch_str)
        elif cfg.DATASETS.NAME == "FACE_DATA":
            pass

        torch.save(model.state_dict(), f"{output_dir}/model.pth")  # save once per epoch
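
# read_pkl is called above but not defined in this snippet. A minimal sketch,
# assuming it unpickles a dict that maps class index -> 512-dim PEDCC centroid
# tensor (the loop above indexes map_dict[target_index.item()] and reshapes
# the concatenation to (-1, 512)). The file name is a placeholder.
import pickle

def read_pkl(path="pedcc_centroids.pkl"):
    with open(path, "rb") as f:
        return pickle.load(f)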