def save_model(model, cfg, file_prefix):
    """Save a model checkpoint together with the config that produced it.

    Weights go to ``<file_prefix>.pth`` and ``str(cfg)`` goes to
    ``<file_prefix>.txt``, both inside the ``data`` directory one level
    above this file's directory.

    :param model: torch.nn.Module whose ``state_dict()`` is saved
    :param cfg: config object; its ``str()`` representation is written out
    :param file_prefix: file name prefix (without extension) of the model
        and config file name. Model is saved as a .pth file and config as
        a txt file
    :return: None
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # Build the target path portably: the original '..\\data' literal only
    # worked on Windows — on POSIX it became a single '..\data' component.
    predicate = os.path.join(dir_path, '..', 'data', file_prefix)
    torch.save(model.state_dict(), predicate + ".pth")
    # 'with' guarantees the handle is closed even if the write raises.
    with open(predicate + ".txt", "w") as f:
        f.write(str(cfg))
+ np.sum(output.cpu().data.numpy()[target.cpu().data.numpy()==0] < 0.5)) / float(args.im_size[1]*args.im_size[2]) n_examples += output.size(0) if n_batches and (batch_i == n_batches-1): break loss /= n_examples acc /= n_examples return loss, acc if args.test: print("Running evaluation on test set.") test_loss, test_acc = evaluate('test') print('Test loss: %f Test accuracy: %f' % (test_loss, test_acc)) else: # train the model one epoch at a time metrics = {'iters':[], 'train_loss':[], 'val_loss':[], 'val_acc':[]} for epoch in range(1, args.epochs + 1): iters, train_losses, val_losses, val_accuracies = train(epoch) metrics['iters'] += iters metrics['train_loss'] += train_losses metrics['val_loss'] += val_losses metrics['val_acc'] += val_accuracies if (epoch % args.save_interval == 0 and args.save_model): save_path = os.path.join(backup_dir, 'IGVCModel' + '_' + str(epoch) + '.pt') print('Saving model: %s' % save_path) torch.save(model.state_dict(), save_path) metrics_path = os.path.join(backup_dir, 'metrics.npy') np.save(metrics_path, metrics)
def train(model, criterion, converter, device, train_datasets, valid_datasets=None, pretrain=False):
    """Train the sequence recognizer on one hard-coded dataset ('symbol').

    :param model: network mapping an image batch to per-timestep class scores
        (assumed time-major, since one length per sample is built from
        ``preds.size(0)`` — TODO confirm against the model's output layout)
    :param criterion: loss called CTC-style as (preds, text, preds_size, length)
    :param converter: label codec with ``encode(labels) -> (text, length)``
        and ``decode(preds, sizes, raw)`` returning decoded strings
    :param device: torch device the model and images are moved to
    :param train_datasets: dict-like mapping dataset name -> Dataset
    :param valid_datasets: dict-like of validation datasets; used every 3rd epoch
    :param pretrain: if True, would load pretrained weights — the loading code
        below is entirely commented out, so this flag is currently a no-op
    """
    print('Device:', device)
    # Multi-GPU wrapping, kept for reference but disabled:
    '''
    data_parallel = False
    if torch.cuda.device_count() > 1:
        print("Use", torch.cuda.device_count(), 'gpus')
        data_parallel = True
        model = nn.DataParallel(model)
    '''
    model = model.to(device)
    if pretrain:
        #print("Using pretrained model")
        '''
        state_dict = torch.load("/home/chen-ubuntu/Desktop/checks_dataset/pths/crnn_pertrain.pth", map_location=device)
        cnn_modules = {}
        rnn_modules = {}
        for module in state_dict:
            if module.split('.')[1] == 'FeatureExtraction':
                key = module.replace("module.FeatureExtraction.", "")
                cnn_modules[key] = state_dict[module]
            elif module.split('.')[1] == 'SequenceModeling':
                key = module.replace("module.SequenceModeling.", "")
                rnn_modules[key] = state_dict[module]
        model.cnn.load_state_dict(cnn_modules)
        model.rnn.load_state_dict(rnn_modules)
        '''
        #model.load_state_dict(torch.load('/root/checks_recognize_v2/pths/hand_num_epoch278_acc0.995020.pth'))
    # Dataset selection and per-dataset batch sizes.
    dataset_name = 'symbol'
    batch_dict = {
        'print_word': 32,
        'hand_num': 48,
        'print_num': 48,
        'symbol': 64,
        'hand_word': 64,
        'seal': 64,
        'catword': 32
    }
    dataset = train_datasets.get(dataset_name)
    dataloader = DataLoader(dataset,
                            batch_size=batch_dict.get(dataset_name),
                            shuffle=True,
                            num_workers=4,
                            drop_last=False)
    lr = 1e-3
    params = model.parameters()
    optimizer = optim.Adam(params, lr)
    optimizer.zero_grad()
    batch_cnt = 0  # global batch counter across epochs (used by the disabled writer calls)
    for epoch in range(config.epochs):
        epoch_loss = 0
        model.train()
        train_acc = 0      # sum of per-batch accuracies
        train_acc_cnt = 0  # number of batches seen this epoch
        for i, (img, label, _) in enumerate(dataloader):
            n_correct = 0
            batch_cnt += 1
            train_acc_cnt += 1
            img = img.to(device)
            text, length = converter.encode(label)
            preds = model(img)
            # One sequence length per batch element.
            preds_size = torch.IntTensor([preds.size(0)] * img.size(0))
            preds = preds.to('cpu')
            loss = criterion(preds, text, preds_size, length)
            # Greedy decode: argmax over classes, then flatten batch-major.
            _, preds = preds.max(2)
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
            list1 = [x for x in label]
            # Exact-match accuracy against the ground-truth labels.
            for pred, target in zip(sim_preds, list1):
                if pred == target:
                    n_correct += 1
            # loss.backward()
            # optimizer.step()
            # model.zero_grad()
            loss.backward()
            # NOTE(review): looks like gradient accumulation, but
            # `(i + 1) % 4` is truthy for every index NOT a multiple of 4,
            # so the optimizer steps on 3 of every 4 batches;
            # `(i + 1) % 4 == 0` was probably intended — confirm.
            if (i + 1) % 4:
                optimizer.step()
                optimizer.zero_grad()
            epoch_loss += loss.item()
            train_acc += n_correct / len(list1)
            if (i + 1) % 4 == 0:
                print("epoch: {:<3d}, dataset:{:<8}, batch: {:<3d}, batch loss: {:4f}, epoch loss: {:4f}, acc: {}". \
                    format(epoch, dataset_name, i, loss.item(), epoch_loss, n_correct / len(list1)))
                # writer.add_scalar('data/train_loss', loss.item(), batch_cnt)
                # writer.add_scalar('data/train_acc', n_correct/len(list1), batch_cnt)
        print('==========train_average_acc is: {:.3f}'.format(train_acc / train_acc_cnt))
        # writer.add_scalar('data/valid_{}acc'.format(dataset_name), train_acc/train_acc_cnt, batch_cnt)
        # Validate on the same dataset name every 3rd epoch.
        if epoch % 3 == 0:
            dataset_names = [dataset_name]
            accs, valid_losses = valid(model, criterion, converter, device, valid_datasets, dataset_names)
            acc, valid_loss = accs.get(dataset_name), valid_losses.get(
                dataset_name)
            print('========== valid acc: ', acc, ' ============valid loss: ', valid_loss)
            # writer.add_scalar('data/valid_{}acc'.format(dataset_name), acc, batch_cnt)
            # writer.add_scalar('data/valid_{}loss'.format(dataset_name), valid_loss, batch_cnt)
        # Periodic checkpoint every 3rd epoch, plus extra checkpoints once the
        # epoch's average train accuracy exceeds 0.95 (note the two save paths
        # differ only by an underscore before the epoch number).
        if epoch % 3 == 0:
            state_dict = model.state_dict()
            torch.save(
                state_dict,
                '/root/last_dataset/crnn_char_pths/catword_lr3_epoch_{}_acc{:4f}.pth'
                .format(epoch + 1, train_acc / train_acc_cnt))
        if train_acc / train_acc_cnt > 0.95:
            state_dict = model.state_dict()
            torch.save(
                state_dict,
                '/root/last_dataset/crnn_char_pths/catword_lr3_epoch{}_acc{:4f}.pth'
                .format(epoch + 1, train_acc / train_acc_cnt))
batch_size=1, num_workers=0, shuffle=True, collate_fn=my_collate) eval_loader = DataLoader(evalset, batch_size=1, num_workers=0, shuffle=False, collate_fn=my_collate) training_loss_sum = [] eval_loss_sum = [] rpn_cls_loss = [] roi_cls_loss = [] rpn_reg_loss = [] roi_reg_loss = [] for epoch in range(num_epoch): train() evaluate() plot_losses() if epoch > 0 and (epoch % 10) == 0: torch.save( model.state_dict(), os.path.join(models_path, f"faster_rcnn_{attempt}_{epoch}.pt")) else: torch.save(model.state_dict(), os.path.join(models_path, f"faster_rcnn_{attempt}.pt")) print("Done!")
def train():
    """Run the full training loop, logging to TensorBoard.

    Relies on module-level state: ``args``, ``model``, ``device``,
    ``train_loader``, ``optimizer``, ``criterion``, ``evaluate``, ``lr``
    and ``backup_dir``. Each epoch trains over ``train_loader``, evaluates
    the validation split every ``args.log_interval`` batches, writes scalar
    and histogram summaries, and checkpoints every ``args.save_interval``
    epochs.
    """
    tb = SummaryWriter(comment=f"LR_{args.lr}_BS_{args.batch_size}")
    # Log one sample batch (image grid + model graph) to identify the run.
    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)
    tb.add_image("image", grid)
    tb.add_graph(model.to(device=device), images.to(device=device))
    print("Batch Size: {} Learning Rate: {}".format(args.lr, args.batch_size))
    for epoch in range(1, args.epochs + 1):
        t1 = time.time()
        # Fresh per-epoch metric buffers. (The original also assigned a
        # defaultdict(list) here that this dict literal immediately
        # overwrote — that dead assignment has been removed.)
        batch_metrics = {
            "iters": [],
            "lrs": [],
            "train_losses": [],
            "val_losses": [],
            "val_accuracies": [],
        }
        model.train()
        for batch_idx, batch in enumerate(train_loader):
            # prepare data
            images = Variable(batch[0]).to(device=device)
            targets = Variable(batch[1]).to(device=device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # Optional live visualization; only meaningful for batch size 1.
            if args.vis and batch_idx % args.log_interval == 0 and images.shape[
                    0] == 1:
                cv2.imshow("output: ", outputs.cpu().data.numpy()[0][0])
                cv2.imshow("target: ", targets.cpu().data.numpy()[0][0])
                cv2.waitKey(10)
            if batch_idx % args.log_interval == 0:
                # Periodic validation + metric collection.
                val_loss, val_acc = evaluate("val", n_batches=args.val_size)
                train_loss = loss.item()
                batch_metrics["iters"].append(
                    len(train_loader.dataset) * (epoch - 1) + batch_idx)
                batch_metrics["lrs"].append(lr)
                batch_metrics["train_losses"].append(train_loss)
                batch_metrics["val_losses"].append(val_loss)
                batch_metrics["val_accuracies"].append(val_acc)
                examples_this_epoch = batch_idx * len(images)
                epoch_progress = 100.0 * batch_idx / len(train_loader)
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\t"
                      "Train Loss: {:.4f}\tVal Loss: {:.4}\tVal Acc: {:.4}".
                      format(
                          epoch,
                          examples_this_epoch,
                          len(train_loader.dataset),
                          epoch_progress,
                          train_loss,
                          val_loss,
                          val_acc,
                      ))
        # Epoch summary: losses are summed, accuracy is averaged.
        print(
            "epoch: {} total train_loss: {:.4f} total val_loss: {:.4f} total val_acc: {:.4f}"
            .format(
                epoch,
                sum(batch_metrics["train_losses"]),
                sum(batch_metrics["val_losses"]),
                sum(batch_metrics["val_accuracies"]) /
                len(batch_metrics["val_accuracies"]),
            ))
        if epoch % args.save_interval == 0 and args.save_model:
            save_path = os.path.join(backup_dir,
                                     "IGVCModel" + "_" + str(epoch) + ".pt")
            print("Saving model: %s" % save_path)
            torch.save(model.state_dict(), save_path)
        tb.add_scalar("train loss", sum(batch_metrics["train_losses"]), epoch)
        tb.add_scalar("val loss", sum(batch_metrics["val_losses"]), epoch)
        tb.add_scalar(
            "val_acc",
            sum(batch_metrics["val_accuracies"]) /
            len(batch_metrics["val_accuracies"]),
            epoch,
        )
        for name, weight in model.named_parameters():
            tb.add_histogram(name, weight, epoch)
            tb.add_histogram("{}.grad".format(name), weight.grad, epoch)
        # NOTE(review): metrics.npy is rewritten every epoch with only that
        # epoch's buffers — confirm whether cumulative metrics were intended.
        metrics_path = os.path.join(backup_dir, "metrics.npy")
        np.save(metrics_path, batch_metrics)
        t2 = time.time()
        print("training time: %.2fs" % (t2 - t1))
    tb.close()
reg_criterion = RegLoss() if cfg.LOSS.REG else None # Create optimizer optimizer = optim.Adam(model.parameters(), lr=cfg.HYPER.LEARNING_RATE) best_loss = float('Inf') for epoch in range(cfg.HYPER.EPOCHS): # Start training train_loss = train_epoch(model, ee_criterion, vec_criterion, col_criterion, lim_criterion, ori_criterion, reg_criterion, optimizer, train_loader, train_target, epoch, logger, cfg.OTHERS.LOG_INTERVAL, writer, device) # Start testing test_loss = test_epoch(model, ee_criterion, vec_criterion, col_criterion, lim_criterion, ori_criterion, reg_criterion, test_loader, test_target, epoch, logger, cfg.OTHERS.LOG_INTERVAL, writer, device) # Save model if test_loss < best_loss: best_loss = test_loss torch.save( model.state_dict(), os.path.join(cfg.OTHERS.SAVE, "best_model_epoch_{:04d}.pth".format(epoch))) logger.info("Epoch {} Model Saved".format(epoch + 1).center( 60, '-'))
def main():
    """Entry point: build gait-recognition loaders, model, losses and
    optimizer, then run the train/test loop with best-rank1 checkpointing.

    Relies on module-level state: ``args``, ``test_f``, ``writer`` and the
    project's ``data_manager``/``models``/loss/sampler helpers.
    """
    torch.manual_seed(1)  # fixed seed for reproducibility
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    print(args)
    # GPU / CPU
    # NOTE(review): hard-codes CUDA; this fails on machines without a GPU.
    device = torch.device('cuda')
    print("Initializing dataset")
    dataset = data_manager.init_dataset('../imdb/dataset_GEI', 'id_list.csv',
                                        args.cooperative)
    # Training augmentation: small random translations only, no rotation.
    transform = transforms.Compose([
        transforms.RandomAffine(degrees=0, translate=(0.05, 0.02)),
        transforms.ToTensor()
    ])
    transform_test = transforms.Compose([transforms.ToTensor()])
    # trainLoader — identity-balanced sampling, 2 instances per identity.
    trainLoader = DataLoader(ImageDataset(dataset.train,
                                          sample='random',
                                          transform=transform),
                             sampler=RandomIdentitySampler(dataset.train,
                                                           num_instances=2),
                             batch_size=args.train_batch,
                             num_workers=args.workers)
    # test/val queryLoader
    # test/val galleryLoader
    test_probeLoader = DataLoader(ImageDataset(dataset.test_probe,
                                               sample='dense',
                                               transform=transform_test),
                                  shuffle=False,
                                  batch_size=args.test_batch,
                                  drop_last=False)
    test_galleryLoader = DataLoader(ImageDataset(dataset.test_gallery,
                                                 sample='dense',
                                                 transform=transform_test),
                                    shuffle=False,
                                    batch_size=args.test_batch,
                                    drop_last=False)
    model = models.model.ICDNet_group_mask_mask_early_8().to(device=device)
    #model = models.model.ICDNet_mask()
    #model= nn.DataParallel(model).cuda()
    #model = models.model.icdnet().to(device=device)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    # Loss functions: contrastive, triplet, similarity, L2 reconstruction,
    # and cross-entropy for identity labels.
    criterion_cont = OnlineContrastiveLoss(margin=3)
    #criterion_trip = OnlineTripletLoss(3)
    criterion_trip = TripletLoss(3)
    criterion_sim = OnlineSimLoss()
    criterion_l2 = nn.MSELoss()
    criterion_label = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(0.5, 0.999))
    #scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    # LR drops by 10x at epoch 140.
    scheduler = lr_scheduler.MultiStepLR(optimizer, [140],
                                         gamma=0.1,
                                         last_epoch=-1)
    #checkpoint = torch.load('./save_group_mask_early8_ones2_0002_sa3_500l2_01label_resbottle_shift002_all190_coo0/ep87.pth.tar')
    #model.load_state_dict(checkpoint['state_dict'])
    start_time = time.time()
    best_rank1 = -np.inf
    #args.max_epoch = 1
    cont_iter = 1  # global iteration counter carried across epochs
    for epoch in range(args.start_epoch, args.max_epoch):
        print("==> {}/{}".format(epoch + 1, args.max_epoch))
        cont_iter = train(epoch, model, criterion_cont, criterion_trip,
                          criterion_sim, criterion_l2, criterion_label,
                          optimizer, scheduler, trainLoader, device, cont_iter)
        # Hard stop after 250k iterations.
        if cont_iter > 250000:
            break
        # Test every epoch (the `if True` keeps the original indentation of a
        # previously conditional test block).
        if True:
            print("=============> Test")
            test_f.write("iter" + str(cont_iter) + '\n')
            rank1, correct_rate = test(model, test_probeLoader,
                                       test_galleryLoader, device)
            writer.add_scalar("Test/rank1", rank1, epoch)
            writer.add_scalar("Test/correct", correct_rate, epoch)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
            # Only checkpoint when rank-1 accuracy improves.
            if is_best:
                state_dict = model.state_dict()
                save_checkpoint(
                    {
                        'state_dict': state_dict,
                        'epoch': epoch,
                        'optimizer': optimizer.state_dict(),
                    }, is_best,
                    osp.join(args.save_dir, 'ep' + str(epoch + 1) + '.pth.tar'))
    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
if step % record_interval == record_interval - 1: test_ce, test_prauc, test_rce = test(model, test_loader) writer.add_scalars('loss/ce', {'val': test_ce}, step) writer.add_scalars('loss/prauc', {'val': test_prauc}, step) writer.add_scalars('loss/rce', {'val': test_rce}, step) writer.add_scalars( 'lr', {'lr': optimizer.state_dict()['param_groups'][0]['lr']}, step) # sheduler.step(test_ce) if calc_score(test_prauc, test_rce) > calc_score( max_score[0], max_score[1]): max_score = (test_prauc, test_rce, step) torch.save( { 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), # 'sheduler_state_dict':sheduler.state_dict(), 'step': step, 'max_score': max_score }, os.path.join(checkpoints_dir, model_name + '_best.pt')) if save_latest: torch.save( { 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'step': step, #'sheduler_state_dict':sheduler.state_dict(), 'max_score': max_score