Exemplo n.º 1
0
def save_model(model, cfg, file_prefix):
    """
    Save a model and the config used to produce it, so we know what
    parameters were used to generate a given model.

    :param model: torch model whose ``state_dict`` is saved
    :param cfg: config object; its ``str()`` representation is written out
    :param file_prefix: file name prefix (without extension) for both output
        files. The model is saved as ``<prefix>.pth`` and the config as
        ``<prefix>.txt`` in the ``../data`` directory relative to this file.
    :return: None
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # Build the path portably; the original hard-coded the Windows
    # separator ('..\\data'), which breaks on POSIX systems.
    predicate = os.path.join(dir_path, '..', 'data', file_prefix)
    torch.save(model.state_dict(), predicate + ".pth")
    # Context manager guarantees the handle is closed even if write fails.
    with open(predicate + ".txt", "w") as f:
        f.write(str(cfg))
Exemplo n.º 2
0
            + np.sum(output.cpu().data.numpy()[target.cpu().data.numpy()==0] < 0.5)) / float(args.im_size[1]*args.im_size[2])
        n_examples += output.size(0)

        if n_batches and (batch_i == n_batches-1):
            break

    loss /= n_examples
    acc /= n_examples
    return loss, acc

# Dispatch on the command-line flag: either run a one-off evaluation on the
# held-out test set, or run the full training loop epoch by epoch.
if args.test:
    print("Running evaluation on test set.")
    test_loss, test_acc = evaluate('test')
    print('Test loss: %f  Test accuracy: %f' % (test_loss, test_acc))
else:
    # Train one epoch at a time, accumulating per-iteration metrics.
    metric_keys = ('iters', 'train_loss', 'val_loss', 'val_acc')
    metrics = {key: [] for key in metric_keys}
    metrics_path = os.path.join(backup_dir, 'metrics.npy')
    for epoch in range(1, args.epochs + 1):
        epoch_results = train(epoch)
        for key, values in zip(metric_keys, epoch_results):
            metrics[key].extend(values)
        # Periodically checkpoint the model weights.
        if args.save_model and epoch % args.save_interval == 0:
            save_path = os.path.join(backup_dir, 'IGVCModel' + '_' + str(epoch) + '.pt')
            print('Saving model: %s' % save_path)
            torch.save(model.state_dict(), save_path)
        # Persist metrics after every epoch so progress survives interruption.
        np.save(metrics_path, metrics)

Exemplo n.º 3
0
def train(model,
          criterion,
          converter,
          device,
          train_datasets,
          valid_datasets=None,
          pretrain=False):
    """
    Train a CRNN-style recognizer on one named dataset using 4-batch
    gradient accumulation, validating and checkpointing every 3rd epoch.

    :param model: network to train; moved to ``device`` before training
    :param criterion: CTC-style loss called as (preds, text, preds_size, length)
    :param converter: label codec providing ``encode``/``decode``
    :param device: torch device to train on
    :param train_datasets: mapping of dataset name -> training dataset
    :param valid_datasets: mapping of dataset name -> validation dataset;
        consumed by ``valid()`` every third epoch
    :param pretrain: if True, pretrained weights would be loaded
        (loading code is currently disabled)
    :return: None
    """
    print('Device:', device)
    model = model.to(device)

    if pretrain:
        # Pretrained CNN/RNN weight loading is currently disabled; the
        # original checkpoint paths live in version history.
        pass

    dataset_name = 'symbol'
    # Batch size tuned per recognition task.
    batch_dict = {
        'print_word': 32,
        'hand_num': 48,
        'print_num': 48,
        'symbol': 64,
        'hand_word': 64,
        'seal': 64,
        'catword': 32
    }
    dataset = train_datasets.get(dataset_name)
    dataloader = DataLoader(dataset,
                            batch_size=batch_dict.get(dataset_name),
                            shuffle=True,
                            num_workers=4,
                            drop_last=False)

    lr = 1e-3
    optimizer = optim.Adam(model.parameters(), lr)
    optimizer.zero_grad()
    batch_cnt = 0
    for epoch in range(config.epochs):
        epoch_loss = 0
        model.train()
        train_acc = 0
        train_acc_cnt = 0

        for i, (img, label, _) in enumerate(dataloader):
            n_correct = 0
            batch_cnt += 1
            train_acc_cnt += 1
            img = img.to(device)
            text, length = converter.encode(label)
            preds = model(img)
            # CTC needs the (constant) time dimension length per sample.
            preds_size = torch.IntTensor([preds.size(0)] * img.size(0))
            preds = preds.to('cpu')
            loss = criterion(preds, text, preds_size, length)

            # Greedy decode for accuracy bookkeeping.
            _, preds = preds.max(2)
            preds = preds.transpose(1, 0).contiguous().view(-1)
            sim_preds = converter.decode(preds.data,
                                         preds_size.data,
                                         raw=False)

            targets = list(label)
            for pred, target in zip(sim_preds, targets):
                if pred == target:
                    n_correct += 1

            loss.backward()
            # Gradient accumulation: step once every 4 batches.
            # BUG FIX: the original `if (i + 1) % 4:` stepped on every batch
            # EXCEPT each 4th one; `== 0` matches the logging condition
            # below and gives true 4-batch accumulation.
            if (i + 1) % 4 == 0:
                optimizer.step()
                optimizer.zero_grad()

            epoch_loss += loss.item()
            train_acc += n_correct / len(targets)

            if (i + 1) % 4 == 0:
                print("epoch: {:<3d}, dataset:{:<8}, batch: {:<3d},  batch loss: {:4f}, epoch loss: {:4f}, acc: {}". \
                      format(epoch, dataset_name, i, loss.item(), epoch_loss, n_correct / len(targets)))

        print('==========train_average_acc is: {:.3f}'.format(train_acc /
                                                              train_acc_cnt))

        if epoch % 3 == 0:
            # Validate on the same dataset every third epoch.
            dataset_names = [dataset_name]
            accs, valid_losses = valid(model, criterion, converter, device,
                                       valid_datasets, dataset_names)
            acc, valid_loss = accs.get(dataset_name), valid_losses.get(
                dataset_name)
            print('========== valid acc: ', acc, '  ============valid loss: ',
                  valid_loss)

        # Periodic checkpoint every third epoch.
        if epoch % 3 == 0:
            torch.save(
                model.state_dict(),
                '/root/last_dataset/crnn_char_pths/catword_lr3_epoch_{}_acc{:4f}.pth'
                .format(epoch + 1, train_acc / train_acc_cnt))

        # Extra checkpoint whenever training accuracy exceeds 95%.
        if train_acc / train_acc_cnt > 0.95:
            torch.save(
                model.state_dict(),
                '/root/last_dataset/crnn_char_pths/catword_lr3_epoch{}_acc{:4f}.pth'
                .format(epoch + 1, train_acc / train_acc_cnt))
Exemplo n.º 4
0
                              batch_size=1,
                              num_workers=0,
                              shuffle=True,
                              collate_fn=my_collate)
    eval_loader = DataLoader(evalset,
                             batch_size=1,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=my_collate)

    training_loss_sum = []
    eval_loss_sum = []
    rpn_cls_loss = []
    roi_cls_loss = []
    rpn_reg_loss = []
    roi_reg_loss = []

    for epoch in range(num_epoch):
        train()
        evaluate()
        plot_losses()

        if epoch > 0 and (epoch % 10) == 0:
            torch.save(
                model.state_dict(),
                os.path.join(models_path, f"faster_rcnn_{attempt}_{epoch}.pt"))
        else:
            torch.save(model.state_dict(),
                       os.path.join(models_path, f"faster_rcnn_{attempt}.pt"))
    print("Done!")
Exemplo n.º 5
0
def train():
    """
    Train ``model`` for ``args.epochs`` epochs, periodically evaluating on
    the validation set and logging metrics, sample images, and parameter
    histograms to TensorBoard.

    Relies on module-level globals: model, device, train_loader, optimizer,
    criterion, args, lr, backup_dir, and the evaluate() helper.
    :return: None
    """
    tb = SummaryWriter(comment=f"LR_{args.lr}_BS_{args.batch_size}")
    # Log one sample batch and the model graph before training starts.
    images, labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)
    tb.add_image("image", grid)
    tb.add_graph(model.to(device=device), images.to(device=device))
    print("Batch Size: {} Learning Rate: {}".format(args.lr, args.batch_size))

    for epoch in range(1, args.epochs + 1):
        t1 = time.time()
        # NOTE: metrics are reset each epoch, so metrics.npy only ever holds
        # the most recent epoch's values. (The original also assigned a
        # defaultdict here that was immediately overwritten — removed.)
        batch_metrics = {
            "iters": [],
            "lrs": [],
            "train_losses": [],
            "val_losses": [],
            "val_accuracies": [],
        }
        model.train()
        for batch_idx, batch in enumerate(train_loader):
            # prepare data
            images = Variable(batch[0]).to(device=device)
            targets = Variable(batch[1]).to(device=device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Optional live visualization, only meaningful for batch size 1.
            if args.vis and batch_idx % args.log_interval == 0 and images.shape[
                    0] == 1:
                cv2.imshow("output: ", outputs.cpu().data.numpy()[0][0])
                cv2.imshow("target: ", targets.cpu().data.numpy()[0][0])
                cv2.waitKey(10)

            if batch_idx % args.log_interval == 0:
                val_loss, val_acc = evaluate("val", n_batches=args.val_size)
                train_loss = loss.item()
                batch_metrics["iters"].append(
                    len(train_loader.dataset) * (epoch - 1) + batch_idx)
                batch_metrics["lrs"].append(lr)
                batch_metrics["train_losses"].append(train_loss)
                batch_metrics["val_losses"].append(val_loss)
                batch_metrics["val_accuracies"].append(val_acc)

                examples_this_epoch = batch_idx * len(images)
                epoch_progress = 100.0 * batch_idx / len(train_loader)
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\t"
                      "Train Loss: {:.4f}\tVal Loss: {:.4}\tVal Acc: {:.4}".
                      format(
                          epoch,
                          examples_this_epoch,
                          len(train_loader.dataset),
                          epoch_progress,
                          train_loss,
                          val_loss,
                          val_acc,
                      ))

        print(
            "epoch: {} total train_loss: {:.4f} total val_loss: {:.4f} total val_acc: {:.4f}"
            .format(
                epoch,
                sum(batch_metrics["train_losses"]),
                sum(batch_metrics["val_losses"]),
                sum(batch_metrics["val_accuracies"]) /
                len(batch_metrics["val_accuracies"]),
            ))

        if epoch % args.save_interval == 0 and args.save_model:
            save_path = os.path.join(backup_dir,
                                     "IGVCModel" + "_" + str(epoch) + ".pt")
            print("Saving model: %s" % save_path)
            torch.save(model.state_dict(), save_path)

        tb.add_scalar("train loss", sum(batch_metrics["train_losses"]), epoch)
        tb.add_scalar("val loss", sum(batch_metrics["val_losses"]), epoch)
        tb.add_scalar(
            "val_acc",
            sum(batch_metrics["val_accuracies"]) /
            len(batch_metrics["val_accuracies"]),
            epoch,
        )

        for name, weight in model.named_parameters():
            tb.add_histogram(name, weight, epoch)
            # Parameters that never received a gradient (e.g. frozen layers)
            # have weight.grad == None; add_histogram would raise on None.
            if weight.grad is not None:
                tb.add_histogram("{}.grad".format(name), weight.grad, epoch)

        metrics_path = os.path.join(backup_dir, "metrics.npy")
        np.save(metrics_path, batch_metrics)
        t2 = time.time()
        print("training time: %.2fs" % (t2 - t1))
    tb.close()
Exemplo n.º 6
0
    reg_criterion = RegLoss() if cfg.LOSS.REG else None

    # Create optimizer
    optimizer = optim.Adam(model.parameters(), lr=cfg.HYPER.LEARNING_RATE)

    best_loss = float('Inf')

    for epoch in range(cfg.HYPER.EPOCHS):
        # Start training
        train_loss = train_epoch(model, ee_criterion, vec_criterion,
                                 col_criterion, lim_criterion, ori_criterion,
                                 reg_criterion, optimizer, train_loader,
                                 train_target, epoch, logger,
                                 cfg.OTHERS.LOG_INTERVAL, writer, device)

        # Start testing
        test_loss = test_epoch(model, ee_criterion, vec_criterion,
                               col_criterion, lim_criterion, ori_criterion,
                               reg_criterion, test_loader, test_target, epoch,
                               logger, cfg.OTHERS.LOG_INTERVAL, writer, device)

        # Save model
        if test_loss < best_loss:
            best_loss = test_loss
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OTHERS.SAVE,
                             "best_model_epoch_{:04d}.pth".format(epoch)))
            logger.info("Epoch {} Model Saved".format(epoch + 1).center(
                60, '-'))
Exemplo n.º 7
0
def main():
    """
    Entry point: build the GEI dataset and loaders, construct the ICDNet
    model and its losses, then train with per-epoch testing, checkpointing
    whenever the rank-1 score improves.

    Relies on module-level globals: args, writer, test_f, models,
    data_manager, and the train()/test()/save_checkpoint() helpers.
    :return: None
    """
    torch.manual_seed(1)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    print(args)
    # GPU / CPU
    device = torch.device('cuda')

    print("Initializing dataset")
    dataset = data_manager.init_dataset('../imdb/dataset_GEI', 'id_list.csv',
                                        args.cooperative)

    # Light augmentation (small random translations) for training only.
    transform = transforms.Compose([
        transforms.RandomAffine(degrees=0, translate=(0.05, 0.02)),
        transforms.ToTensor()
    ])
    transform_test = transforms.Compose([transforms.ToTensor()])
    # Training loader samples 2 instances per identity for the metric losses.
    trainLoader = DataLoader(ImageDataset(dataset.train,
                                          sample='random',
                                          transform=transform),
                             sampler=RandomIdentitySampler(dataset.train,
                                                           num_instances=2),
                             batch_size=args.train_batch,
                             num_workers=args.workers)

    # test/val probe and gallery loaders for rank-1 evaluation.
    test_probeLoader = DataLoader(ImageDataset(dataset.test_probe,
                                               sample='dense',
                                               transform=transform_test),
                                  shuffle=False,
                                  batch_size=args.test_batch,
                                  drop_last=False)

    test_galleryLoader = DataLoader(ImageDataset(dataset.test_gallery,
                                                 sample='dense',
                                                 transform=transform_test),
                                    shuffle=False,
                                    batch_size=args.test_batch,
                                    drop_last=False)
    model = models.model.ICDNet_group_mask_mask_early_8().to(device=device)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion_cont = OnlineContrastiveLoss(margin=3)
    criterion_trip = TripletLoss(3)
    criterion_sim = OnlineSimLoss()
    criterion_l2 = nn.MSELoss()
    criterion_label = nn.CrossEntropyLoss()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(0.5, 0.999))
    # Single LR drop (x0.1) at epoch 140.
    scheduler = lr_scheduler.MultiStepLR(optimizer, [140],
                                         gamma=0.1,
                                         last_epoch=-1)

    start_time = time.time()
    best_rank1 = -np.inf
    cont_iter = 1
    for epoch in range(args.start_epoch, args.max_epoch):
        print("==> {}/{}".format(epoch + 1, args.max_epoch))
        cont_iter = train(epoch, model, criterion_cont, criterion_trip,
                          criterion_sim, criterion_l2, criterion_label,
                          optimizer, scheduler, trainLoader, device, cont_iter)
        # Hard cap on total training iterations.
        if cont_iter > 250000:
            break
        # Test after every epoch (the original wrapped this in a redundant
        # `if True:` block, removed here).
        print("=============> Test")
        test_f.write("iter" + str(cont_iter) + '\n')
        rank1, correct_rate = test(model, test_probeLoader,
                                   test_galleryLoader, device)
        writer.add_scalar("Test/rank1", rank1, epoch)
        writer.add_scalar("Test/correct", correct_rate, epoch)
        is_best = rank1 > best_rank1
        if is_best:
            # Merged the original's two consecutive duplicate
            # `if is_best:` blocks into one.
            best_rank1 = rank1
            save_checkpoint(
                {
                    'state_dict': model.state_dict(),
                    'epoch': epoch,
                    'optimizer': optimizer.state_dict(),
                }, is_best,
                osp.join(args.save_dir,
                         'ep' + str(epoch + 1) + '.pth.tar'))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
Exemplo n.º 8
0
        if step % record_interval == record_interval - 1:
            test_ce, test_prauc, test_rce = test(model, test_loader)
            writer.add_scalars('loss/ce', {'val': test_ce}, step)
            writer.add_scalars('loss/prauc', {'val': test_prauc}, step)
            writer.add_scalars('loss/rce', {'val': test_rce}, step)
            writer.add_scalars(
                'lr', {'lr': optimizer.state_dict()['param_groups'][0]['lr']},
                step)
            # sheduler.step(test_ce)

            if calc_score(test_prauc, test_rce) > calc_score(
                    max_score[0], max_score[1]):
                max_score = (test_prauc, test_rce, step)
                torch.save(
                    {
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        # 'sheduler_state_dict':sheduler.state_dict(),
                        'step': step,
                        'max_score': max_score
                    },
                    os.path.join(checkpoints_dir, model_name + '_best.pt'))

    if save_latest:
        torch.save(
            {
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'step': step,
                #'sheduler_state_dict':sheduler.state_dict(),
                'max_score': max_score