Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=50, type=int, help='epoch number')
    parser.add_argument('-b', '--batch_size', default=64, type=int, help='mini-batch size')
    parser.add_argument('--lr', '--learning_rate', default=1e-4, type=float, help='initial learning rate')
    parser.add_argument('-c', '--continue', dest='continue_path', type=str, required=False)
    parser.add_argument('--state_dict', default=None, type=str, required=False,
                        help='state_dict when doing full training')
    parser.add_argument('--exp_name', default=config.exp_name, type=str, required=False)
    parser.add_argument('--drop_rate', default=0, type=float, required=False)
    parser.add_argument('--local', action='store_true', help='train local branch')
    args = parser.parse_args()
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()
    save_args(args, config.log_dir)

    # get network
    if args.state_dict is not None:
        state_dict = torch.load(args.state_dict)
        net = fusenet()
        net.load_state_dict(state_dict)
        net.set_fcweights()
    else:
        global_branch_state = torch.load(GLOBAL_BRANCH_DIR)
        local_branch_state = torch.load(LOCAL_BRANCH_DIR)
        net = fusenet(global_branch_state, local_branch_state)

    net.to(config.device)
    sess = Session(config, net=net)

    # get dataloader
    train_loader = get_dataloaders('train', batch_size=args.batch_size,
                                   shuffle=True)

    valid_loader = get_dataloaders('valid', batch_size=args.batch_size,
                                   shuffle=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    # start session
    clock = sess.clock
    tb_writer = sess.tb_writer
    sess.save_checkpoint('start.pth.tar')

    # set criterion, optimizer and scheduler
    criterion = nn.BCELoss().cuda()
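    # BCELoss expects probabilities in [0, 1]; the network's final layer is assumed to end in a sigmoid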

    if args.local:  # train local branch
        optimizer = optim.Adam(sess.net.module.local_branch.parameters(), args.lr)
    else:   # train final fc layer
        optimizer = optim.Adam(sess.net.classifier.parameters(), args.lr)

    scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.1, patience=10, verbose=True)
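    # 'max' mode: the scheduler lowers the lr when the monitored value (the validation AUC passed to scheduler.step below) stops improving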

    # start training
    for e in range(args.epochs):
        train_out = train_model(train_loader, sess.net,
                                criterion, optimizer, clock.epoch)
        valid_out = valid_model(valid_loader, sess.net,
                                criterion, optimizer, clock.epoch)

        tb_writer.add_scalars('loss', {'train': train_out['epoch_loss'],
                                       'valid': valid_out['epoch_loss']}, clock.epoch)

        tb_writer.add_scalars('acc', {'train': train_out['epoch_acc'],
                                      'valid': valid_out['epoch_acc']}, clock.epoch)

        tb_writer.add_scalar('auc', valid_out['epoch_auc'], clock.epoch)

        tb_writer.add_scalar('learning_rate', optimizer.param_groups[-1]['lr'], clock.epoch)
        scheduler.step(valid_out['epoch_auc'])

        if valid_out['epoch_auc'] > sess.best_val_acc:
            sess.best_val_acc = valid_out['epoch_auc']
            sess.save_checkpoint('best_model.pth.tar')

        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')

        clock.tock()
Example #2
def train_net(model, file_path, in_seq_len, out_seq_len, pre_model, save_dir,
              batch_size, lr, log_after, cuda, device):
    print(model)
    # if os.path.exists('runs'):
    #     import shutil
    #     shutil.rmtree('runs') # just in case...
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    writer = SummaryWriter()
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    train_dataloader, val_dataloader, test_dataloader = get_dataloaders(
        file_path=file_path,
        in_seq_len=in_seq_len,
        out_seq_len=out_seq_len,
        batch_size=batch_size)
    if True:
        i = 1
        m_loss, m_accuracy = [], []
        if pre_model:
            model.load_state_dict(torch.load(pre_model))
            print('log: resumed model {} successfully!'.format(pre_model))
            # starting point
            model_number = int(re.findall(r'\d+', str(pre_model))[0])
            i = i + model_number - 1
        else:
            print("log: let's start from the beginning...")

        while True:
            i += 1
            net_loss = []
            # new model path
            save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
            # remember to save only five previous models, so
            del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 6))
            if os.path.exists(del_this):
                os.remove(del_this)
                print('log: removed {}'.format(del_this))

            if i > 1 and not os.path.exists(save_path):
                torch.save(model.state_dict(), save_path)
                print('log: saved {}'.format(save_path))

            for idx, data in enumerate(train_dataloader, 1):
                ##########################
                model.train()  # train mode at each epoch, just in case...
                #################################
                test_x, label = data['input'].unsqueeze(
                    2), data['label'].squeeze(1)
                if cuda:
                    test_x = test_x.cuda(device=device)
                    label = label.cuda(device=device)
                out_x, h_n = model.continuous_forward(test_x,
                                                      out_seq_len=out_seq_len)
                loss = criterion(out_x.view_as(label), label)
                net_loss.append(loss.item())
                if idx % log_after == 0 and idx > 0:
                    print('{}. ({}/{}) image size = {}, loss = {}'.format(
                        i, idx, len(train_dataloader), out_x.size(),
                        loss.item()))
                #################################
                # three steps for backprop
                model.zero_grad()
                loss.backward()
                # perform gradient clipping between loss backward and optimizer step
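                # (clip_grad_norm_ rescales all gradients so their global L2 norm is at most max_norm=0.05)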
                clip_grad_norm_(model.parameters(), 0.05)
                optimizer.step()
                #################################
            mean_loss = np.asarray(net_loss).sum() / idx
            m_loss.append((i, mean_loss))
            writer.add_scalar(tag='train_loss',
                              scalar_value=mean_loss,
                              global_step=i)
            print('####################################')
            print('in_shape = {}, out_shape = {}'.format(
                test_x.shape, out_x.shape))
            print('epoch {} -> total loss = {:.5f}'.format(i, mean_loss))
            print('####################################')

            # validate model after each epoch
            eval_net(model=model,
                     out_seq_len=out_seq_len,
                     writer=writer,
                     criterion=criterion,
                     val_loader=val_dataloader,
                     denominator=batch_size,
                     cuda=cuda,
                     device=device,
                     global_step=i)
    pass
Example #3
def train_net(model, data_path, pre_model, save_dir, batch_size, lr, log_after, cuda, device, one_hot=False):
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    lr_final = 0.0000003
    num_epochs = 500
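    # choose gamma so the lr decays geometrically from lr to lr_final over num_epochs: lr * gamma**num_epochs == lr_final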
    LR_decay = (lr_final/lr)**(1./num_epochs)
    scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=LR_decay)
    # print(LR_decay, optimizer.state)
    # print(optimizer.param_groups[0]['lr'])
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(path_to_nparray=data_path,
                                                                batch_size=batch_size,
                                                                normalize=True)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if True:
        i = 1
        m_loss, m_accuracy = [], []
        if pre_model:
            # self.load_state_dict(torch.load(pre_model)['model'])
            model.load_state_dict(torch.load(os.path.join(save_dir, "model-"+pre_model+'.pt')))
            print('log: resumed model {} successfully!'.format(pre_model))
            print(model)

            # starting point
            # model_number = int(pre_model.split('/')[1].split('-')[1].split('.')[0])
            model_number = int(pre_model)  # pre_model is the checkpoint number
            i = i + model_number - 1
        else:
            print('log: starting anew...')

        while i < num_epochs:
            i += 1
            net_loss = []
            # new model path
            save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
            # remember to save only five previous models, so
            del_this = os.path.join(save_dir, 'model-{}.pt'.format(i-5))
            if os.path.exists(del_this):
                os.remove(del_this)
                print('log: removed {}'.format(del_this))

            if i > 1 and not os.path.exists(save_path):
                torch.save(model.state_dict(), save_path)
                print('log: saved {}'.format(save_path))

            correct_count, total_count = 0, 0
            for idx, data in enumerate(train_loader):
                ##########################
                model.train() # train mode at each epoch, just in case...
                ##########################
                test_x, label = data
                if cuda:
                    test_x = test_x.cuda(device=device)
                    label = label.cuda(device=device)
                # forward
                out_x, pred = model(test_x)
                # out_x, pred = out_x.cpu(), pred.cpu()
                loss = criterion(out_x, label)
                net_loss.append(loss.item())

                # get accuracy metric
                if one_hot:
                    batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
                else:
                    batch_correct = (label.eq(pred.long())).double().sum().item()
                correct_count += batch_correct
                # print(batch_correct)
                total_count += float(pred.size(0))
                if idx % log_after == 0 and idx > 0:
                    print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(i,
                                                                                            idx,
                                                                                            len(train_loader),
                                                                                            out_x.size(),
                                                                                            loss.item(),
                                                                                            batch_correct,
                                                                                            pred.size(0)))
                #################################
                # three steps for backprop
                model.zero_grad()
                loss.backward()
                # perform gradient clipping between loss backward and optimizer step
                clip_grad_norm_(model.parameters(), 0.05)
                optimizer.step()
                #################################
            # remember this should be in the epoch loop ;)
            scheduler.step()  # to dynamically change the learning rate
            mean_accuracy = correct_count / total_count * 100
            mean_loss = np.asarray(net_loss).mean()
            m_loss.append((i, mean_loss))
            m_accuracy.append((i, mean_accuracy))

            writer.add_scalar(tag='train loss', scalar_value=mean_loss, global_step=i)
            writer.add_scalar(tag='train over_all accuracy', scalar_value=mean_accuracy, global_step=i)

            print('####################################')
            print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}% (lr: {})'.format(i,
                                                                                              mean_loss,
                                                                                              mean_accuracy,
                                                                                              optimizer.param_groups[0]['lr']))
            print('####################################')

            # validate model after each epoch
            with torch.no_grad():
                eval_net(model=model, writer=writer, criterion=criterion,
                         val_loader=val_dataloader, denominator=batch_size,
                         cuda=cuda, device=device, global_step=i, one_hot=one_hot)
    pass
Example #4
def train(cfg) -> None:

    learning_rate = cfg["learning_rate"]
    emb_dim = cfg["emb_dim"]
    dropout = cfg["dropout"]
    n_heads = cfg["n_heads"]
    n_encoder_layers = cfg["n_encoder_layers"]
    n_decoder_layers = cfg["n_decoder_layers"]
    dim_feedforward = cfg["dim_feedforward"]
    batch_size = cfg["batch_size"]
    validation_batch_size = cfg["validation_batch_size"]
    max_window_size = cfg["max_window_size"]
    num_workers = cfg["num_workers"]
    use_lectures = cfg["use_lectures"]
    use_prior_q_times = cfg["use_prior_q_times"]
    val_step_frequency = cfg["val_step_frequency"]
    val_size = cfg["val_size"]
    use_agg_feats = cfg["use_agg_feats"]
    use_exercise_feats = cfg["use_exercise_feats"]
    use_lgbm_feats = cfg["use_lgbm_feats"]
    concat_response_embeds = cfg["concat_response_embeds"]

    train_loader, val_loader = get_dataloaders(
        batch_size=batch_size,
        validation_batch_size=validation_batch_size,
        max_window_size=max_window_size,
        use_lectures=use_lectures,
        num_workers=num_workers,
        use_agg_feats=use_agg_feats,
    )

    # Init our model
    model = RIIDDTransformerModel(
        learning_rate=learning_rate,
        emb_dim=emb_dim,  # embedding dimension - this is for everything
        dropout=dropout,
        n_heads=n_heads,
        n_encoder_layers=n_encoder_layers,
        n_decoder_layers=n_decoder_layers,
        dim_feedforward=dim_feedforward,
        max_window_size=max_window_size,
        use_prior_q_times=use_prior_q_times,
        lr_step_frequency=val_step_frequency,
        use_agg_feats=use_agg_feats,
        use_exercise_feats=use_exercise_feats,
        use_lgbm_feats=use_lgbm_feats,
        concat_response_embeds=concat_response_embeds)

    experiment_name = "concat_response_embeds"
    logger = TensorBoardLogger(f"{get_wd()}lightning_logs",
                               name=experiment_name)

    # Initialize a trainer
    trainer = pl.Trainer(
        gpus=1,
        max_epochs=1000,
        progress_bar_refresh_rate=1,
        callbacks=[
            EarlyStopping(monitor="avg_val_auc", patience=10, mode="max"),
            ModelCheckpoint(
                monitor="avg_val_auc",
                filename="{epoch}-{val_loss_step:.2f}-{avg_val_auc:.2f}",
                mode="max",
            ),
            LearningRateMonitor(logging_interval="epoch"),
        ],
        logger=logger,
        limit_train_batches=val_step_frequency,  # cap training batches so validation runs every val_step_frequency steps
        limit_val_batches=val_size,  # run through only a val_size fraction of the validation set each time
    )

    # Train the model ⚡
    trainer.fit(
        model,
        train_dataloader=train_loader,
        val_dataloaders=[val_loader],
    )

    # Test on Final Full validation set
    trainer.test(test_dataloaders=[val_loader])
Example #5
    # set up the handler
    filehandler = logging.FileHandler(
        os.path.join(opt.log_path, "convert-stdout.txt"))
    formatter = logging.Formatter(LOG_FORMAT)
    filehandler.setFormatter(formatter)

    # add a 'logfile' handler now!
    logging.getLogger().addHandler(filehandler)

    # set random seeds
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    np.random.seed(opt.seed)

    # load model
    model, optimizer = load_stock_model.load_model_and_optimizer(
        opt, reload_model=True)

    # get datasets and dataloaders
    train_loader, train_dataset, test_loader, test_dataset = dataset.get_dataloaders(
        opt)

    try:
        # Evaluate the model on the train and test splits
        evaluate(opt, model, train_loader, "df-train.feather")
        evaluate(opt, model, test_loader, "df-test.feather")

    except KeyboardInterrupt:
        logging.info("Training got interrupted, saving log-files now.")
Example #6
in_channels = 1   # assumed value: `in_channels` is not defined in this snippet
out_channels = in_channels
batch_size = 32   # assumed value: used by get_dataloaders below but not defined in this snippet
num_training_updates = 25000
num_hiddens = 128
num_residual_hiddens = 32
num_residual_layers = 2
embedding_dim = 64
num_embeddings = 512
commitment_cost = 0.25
decay = 0.99
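# commitment_cost and decay are the usual VQ-VAE codebook hyperparameters: commitment_cost weights the encoder's commitment loss, decay is the EMA codebook update rate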
learning_rate = 3e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dset = MocapDataset()

train_loader, test_loader = get_dataloaders(batch_size=batch_size)

print('There are {} minibatches with {} batch_size in one epoch'.format(
    len(train_loader), train_loader.batch_size))

model = Model(in_channels,
              out_channels,
              num_hiddens=num_hiddens,
              num_residual_layers=num_residual_layers,
              num_residual_hiddens=num_residual_hiddens,
              embedding_dim=embedding_dim,
              num_embeddings=num_embeddings,
              commitment_cost=commitment_cost,
              decay=decay).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate, amsgrad=True)
loss_function = nn.MSELoss()
Example #7
def eval_net(**kwargs):
    model = kwargs['model']
    cuda = kwargs['cuda']
    device = kwargs['device']
    if cuda:
        model.cuda(device=device)
    if 'criterion' in kwargs.keys():
        writer = kwargs['writer']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        global_step = kwargs['global_step']
        correct_count, total_count = 0, 0
        net_loss = []
        model.eval()  # put in eval mode first ############################
        print('evaluating with batch size = 1')
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())

            # get accuracy metric
            batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            total_count += float(pred.size(0))
        #################################
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        # summarize mean accuracy
        writer.add_scalar(tag='val. loss',
                          scalar_value=mean_loss,
                          global_step=global_step)
        writer.add_scalar(tag='val. over_all accuracy',
                          scalar_value=mean_accuracy,
                          global_step=global_step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print(
            'log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.
            format(mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    else:
        # model, images, labels, pre_model, save_dir, sum_dir, batch_size, lr, log_after, cuda
        pre_model = kwargs['pre_model']
        base_folder = kwargs['base_folder']
        batch_size = kwargs['batch_size']
        log_after = kwargs['log_after']
        criterion = nn.CrossEntropyLoss()
        un_confusion_meter = tnt.meter.ConfusionMeter(10, normalized=False)
        confusion_meter = tnt.meter.ConfusionMeter(10, normalized=True)
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(base_folder=base_folder,
                                            batch_size=batch_size)
        net_accuracy, net_loss = [], []
        correct_count = 0
        total_count = 0
        print('batch size = {}'.format(batch_size))
        model.eval()  # put in eval mode first
        for idx, data in enumerate(test_loader):
            # if idx == 1:
            #     break
            # print(model.training)
            test_x, label = data['input'], data['label']
            # print(test_x)
            # print(test_x.shape)
            # this = test_x.numpy().squeeze(0).transpose(1,2,0)
            # print(this.shape, np.min(this), np.max(this))
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            un_confusion_meter.add(predicted=pred, target=label)
            confusion_meter.add(predicted=pred, target=label)

            ###############################
            # pred = pred.view(-1)
            # pred = pred.cpu().numpy()
            # label = label.cpu().numpy()
            # print(pred.shape, label.shape)
            ###############################

            # get accuracy metric
            # correct_count += np.sum((pred == label))
            # print(pred, label)
            batch_correct = (label.eq(pred.long())).sum().item()
            correct_count += batch_correct
            # print(batch_correct)
            total_count += float(batch_size)
            net_loss.append(loss.item())
            if idx % log_after == 0:
                print('log: on {}'.format(idx))

            #################################
        mean_loss = np.asarray(net_loss).mean()
        mean_accuracy = correct_count * 100 / total_count
        print(correct_count, total_count)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print(
            'log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
                mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

        with open('normalized.pkl', 'wb') as this:
            pkl.dump(confusion_meter.value(),
                     this,
                     protocol=pkl.HIGHEST_PROTOCOL)
        with open('un_normalized.pkl', 'wb') as this:
            pkl.dump(un_confusion_meter.value(),
                     this,
                     protocol=pkl.HIGHEST_PROTOCOL)
        pass
    pass
Example #8
parser.add_argument('--progressbar', action='store_true', default=False, help='Show progress bar during train/test.')
parser.add_argument('--evaluate', action='store_true', default=False, help='Evaluation only using 25 clips per video.')

##### Read in parameters
opt = parser.parse_args()

opt.multiple_clips = False
opt.kernels = multiprocessing.cpu_count()

"""=================================DATALOADER SETUPS====================="""
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    opt.bs = opt.bs * torch.cuda.device_count()

print('Total batch size: %d' % opt.bs)
dataloaders = dataset.get_dataloaders(opt)
if not opt.evaluate:
    opt.n_classes = dataloaders['training'][0].dataset.class_embed.shape[0]
else:
    opt.n_classes = dataloaders['testing'][0].dataset.class_embed.shape[0]

"""=================================OUTPUT FOLDER====================="""
opt.savename = opt.save_path + '/'
if not opt.evaluate:
    opt.savename += '%s/CLIP%d_LR%f_%s_BS%d' % (
            opt.dataset, opt.clip_len,
            opt.lr, opt.network, opt.bs)

    if opt.class_overlap > 0:
        opt.savename += '_CLASSOVERLAP%.2f' % opt.class_overlap
Example #9
File: main.py Project: frgfm/TorchHawk
def main(args):

    set_seed(SEED)

    train_transforms, test_transforms = get_transforms(args.dataset)
    print(f"Data transformations:\n{train_transforms}\n")

    # Get the dataloaders
    train_loader, test_loader = get_dataloaders(args.dataset, args.batch_size,
                                                args.workers, train_transforms,
                                                test_transforms)

    # Architecture
    if args.dataset == 'mnist':
        in_channels = 1
    else:
        raise NotImplementedError()
    if args.activation == 'relu':
        activation = nn.ReLU(inplace=True)
    else:
        raise NotImplementedError()
    if args.pooling == 'max':
        pooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
    else:
        raise NotImplementedError()
    drop_rate = args.drop_rate

    # Build model
    model = LeNet5(in_channels, activation, pooling, drop_rate)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()
    # Weight normal initialization
    if args.init_weights:
        model.apply(normal_initialization)

    # Loss function & optimizer
    if args.criterion == 'ce':
        criterion = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError()
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NotImplementedError()

    # Resume a previous training state (needs the optimizer to exist already)
    start_epoch = 0
    if args.resume is not None:
        model, optimizer, start_epoch = load_training_state(
            model, optimizer, args.resume)

    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.5,
                                  patience=0,
                                  threshold=1e-2,
                                  verbose=True)

    # Output folder
    output_folder = os.path.join(args.output_folder, args.training_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    log_path = os.path.join(args.output_folder, 'logs', args.training_name)
    if os.path.exists(log_path):
        rmtree(log_path)
    logger = SummaryWriter(log_path)

    # Train
    best_loss = math.inf
    mb = master_bar(range(args.nb_epochs))
    for epoch_idx in mb:
        # Training
        train_epoch(model,
                    train_loader,
                    optimizer,
                    criterion,
                    mb,
                    tb_logger=logger,
                    epoch=start_epoch + epoch_idx)

        # Evaluation
        val_loss, accuracy = evaluate(model, test_loader, criterion)

        mb.first_bar.comment = f"Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs}"
        mb.write(
            f'Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs} - Validation loss: {val_loss:.4} (Acc@1: {accuracy:.2%})'
        )

        # State saving
        if val_loss < best_loss:
            print(
                f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state..."
            )
            best_loss = val_loss
            torch.save(
                dict(epoch=start_epoch + epoch_idx,
                     model_state_dict=model.state_dict(),
                     optimizer_state_dict=optimizer.state_dict(),
                     val_loss=val_loss),
                os.path.join(output_folder, "training_state.pth"))

        if logger is not None:
            current_iter = (start_epoch + epoch_idx + 1) * len(train_loader)
            logger.add_scalar(f"Validation loss", val_loss, current_iter)
            logger.add_scalar(f"Error rate", 1 - accuracy, current_iter)
            logger.flush()
        scheduler.step(val_loss)
Example #10
from common import *
from dataset import get_dataloaders
from model import MURA_Net
from train import train_model
import os

dataloaders, dataset_sizes = get_dataloaders(
    study_name='XR_HUMERUS',
    data_dir='MURA-v1.0',
    batch_size=50,
    batch_eval_ten=15,
    shuffle=True
)

print(dataset_sizes)

model = MURA_Net()
model = model.to(device)

# model.load_state_dict(torch.load('models/model_XR_WRIST.pth'))
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, verbose=True)

model = train_model(model, optimizer, dataloaders, scheduler, dataset_sizes, 500)
# torch.save(model.state_dict(), 'models/model_hand_auc.pth')
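
For context, the script above assumes a get_dataloaders helper that returns both the loaders and the dataset sizes. The following is a minimal, hypothetical sketch of that shape (the real MURA dataset module is not shown in this listing, so stand-in tensors replace the actual image loading):

import torch
from torch.utils.data import DataLoader, TensorDataset

def get_dataloaders(study_name, data_dir, batch_size, batch_eval_ten, shuffle):
    # stand-in datasets; a real implementation would index the MURA studies under data_dir
    datasets = {
        'train': TensorDataset(torch.randn(100, 3, 224, 224), torch.randint(0, 2, (100,))),
        'valid': TensorDataset(torch.randn(20, 3, 224, 224), torch.randint(0, 2, (20,))),
    }
    dataloaders = {
        'train': DataLoader(datasets['train'], batch_size=batch_size, shuffle=shuffle),
        'valid': DataLoader(datasets['valid'], batch_size=batch_eval_ten, shuffle=False),
    }
    dataset_sizes = {k: len(ds) for k, ds in datasets.items()}
    return dataloaders, dataset_sizes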

Example #11
def main():
    start_time = time.time()
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    torch.cuda.set_device(args.gpu)
    reproducibility(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    model = Network(args.model_name,
                    CIFAR_CLASSES,
                    sub_policies,
                    args.use_cuda,
                    args.use_parallel,
                    temperature=args.temperature,
                    criterion=criterion)
    # model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # train_transform, valid_transform = utils._data_transforms_cifar10(args)
    # train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)
    # train_data = AugmCIFAR10(
    #     root=args.data, train=True, download=True,
    #     transform=train_transform, ops_names=sub_policies, search=True, magnitudes=model.magnitudes)
    # valid_data = AugmCIFAR10(
    #     root=args.data, train=True, download=True,
    #     transform=train_transform, ops_names=sub_policies, search=False, magnitudes=model.magnitudes)

    # num_train = len(train_data)
    # indices = list(range(num_train))
    # split = int(np.floor(args.train_portion * num_train))

    # train_queue = torch.utils.data.DataLoader(
    #     train_data, batch_size=args.batch_size,
    #     sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
    #     pin_memory=True, num_workers=args.num_workers)

    # valid_queue = torch.utils.data.DataLoader(
    #     valid_data, batch_size=args.batch_size,
    #     sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
    #     pin_memory=True, num_workers=args.num_workers)
    train_queue, valid_queue = get_dataloaders(args.dataset,
                                               args.batch_size,
                                               args.num_workers,
                                               args.dataroot,
                                               sub_policies,
                                               model.magnitudes,
                                               args.cutout,
                                               args.cutout_length,
                                               split=args.train_portion,
                                               split_idx=0,
                                               target_lb=-1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    architect = Architect(model, args)

    for epoch in range(args.epochs):
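        # note: stepping the scheduler at the start of the epoch is the pre-1.1 PyTorch idiom; current releases expect scheduler.step() after optimizer.step()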
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        genotype = model.genotype()
        # logging.info('genotype = %s', genotype)
        print_genotype(genotype)
        # logging.info('%s' % str(torch.nn.functional.softmax(model.ops_weights, dim=-1)))
        probs = model.ops_weights
        # logging.info('%s' % str(probs / probs.sum(-1, keepdim=True)))
        logging.info('%s' % str(torch.nn.functional.softmax(probs, dim=-1)))
        logging.info('%s' % str(model.probabilities.clamp(0, 1)))
        logging.info('%s' % str(model.magnitudes.clamp(0, 1)))

        # training
        train_acc, train_obj = train(train_queue, valid_queue, model,
                                     architect, criterion, optimizer, lr)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        utils.save(model, os.path.join(args.save, 'weights.pt'))
    end_time = time.time()
    elapsed = end_time - start_time
    logging.info('elapsed time: %.3f Hours' % (elapsed / 3600.))
Example #12
def train(cfg: DictConfig) -> None:

    # Determine device (GPU, CPU, etc.)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model
    model = get_network(cfg)

    # Data Loaders
    train_loader, val_loader = get_dataloaders(cfg, num_workers=cfg.data_loader_workers)

    # Your training loop
    trainer = create_training_loop(model, cfg, "trainer", device=device)
    # Your evaluation loop
    evaluator = create_evaluation_loop(model, cfg, "evaluator", device=device)

    ld = LogDirector(cfg, engines=[trainer, evaluator])

    ########################################################################
    # Logging Callbacks
    ########################################################################

    # Helper to run the evaluation loop
    def run_evaluator():
        evaluator.run(val_loader)
        return evaluator  # NOTE: Must return the engine we want to log from

    ld.set_event_handlers(
        trainer,
        Events.ITERATION_COMPLETED(every=50),
        EngineStateAttr.OUTPUT,
        [
            (LOG_OP.SAVE_IMAGE, ["im"]),  # Save images to a folder
            (LOG_OP.LOG_MESSAGE, ["nll"],),  # Log fields as message in logfile
            (LOG_OP.SAVE_IN_DATA_FILE, ["nll"],),  # Log fields as separate data files
            (
                LOG_OP.NUMBER_TO_VISDOM,
                [
                    # First plot, key is "p1"
                    VisPlot(
                        var_name="nll",
                        plot_key="p1",
                        split="nll_1",
                        # Any opts that Visdom supports
                        opts={"title": "Plot 1", "xlabel": "Iters", "fillarea": True},
                    ),
                    VisPlot(var_name="nll_2", plot_key="p1", split="nll_2",),
                ],
            ),
            (
                LOG_OP.IMAGE_TO_VISDOM,
                [
                    VisImg(
                        var_name="im",
                        img_key="1",
                        env="images",
                        opts={"caption": "a current image", "title": "title"},
                    ),
                    VisImg(
                        var_name="im",
                        img_key="2",
                        env="images",
                        opts={"caption": "a current image", "title": "title"},
                    ),
                ],
            ),
        ],
    )

    ld.set_event_handlers(
        trainer,
        Events.EPOCH_COMPLETED,
        EngineStateAttr.METRICS,
        [
            (
                LOG_OP.LOG_MESSAGE,
                ["nll", "accuracy",],
            ),  # Log fields as message in logfile
            (
                LOG_OP.SAVE_IN_DATA_FILE,
                ["accuracy"],
            ),  # Log fields as separate data files
            (
                LOG_OP.NUMBER_TO_VISDOM,
                [
                    # First plot, key is "p1"
                    VisPlot(
                        var_name="accuracy",
                        plot_key="p3",
                        split="acc",
                        # Any opts that Visdom supports
                        opts={"title": "Eval Acc", "xlabel": "Iters"},
                    ),
                    # First plot, key is "p1"
                    VisPlot(
                        var_name="nll",
                        plot_key="p4",
                        split="nll",
                        # Any opts that Visdom supports
                        opts={"title": "Eval Nll", "xlabel": "Iters", "fillarea": True},
                    ),
                ],
            ),
        ],
        # Run the evaluation loop, then do log operations from the return engine
        pre_op=run_evaluator,
    )

    # Execute training
    trainer.run(train_loader, max_epochs=cfg.mode.train.max_epochs)
Example #13
def eval_net(**kwargs):
    cuda = kwargs['cuda']
    device = kwargs['device']
    model = kwargs['model']
    model.eval()

    if cuda:
        model.cuda(device=device)
    if 'writer' in kwargs.keys():
        num_classes = kwargs['num_classes']
        batch_size = kwargs['batch_size']
        writer = kwargs['writer']
        step = kwargs['step']
        denominator = kwargs['denominator']
        val_loader = kwargs['val_loader']
        model = kwargs['model']
        criterion = kwargs['criterion']
        net_accuracy, net_loss = [], []
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            test_x = test_x.cuda() if cuda else test_x
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)
            # get accuracy metric
            accuracy = (pred == label).sum()
            accuracy = accuracy * 100 / (test_x.size(0)*64**2)
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())

            # per class accuracies
            # avg = []
            # for j in range(num_classes):
            #     class_pred = (pred == j)
            #     class_label = (label == j)
            #     class_accuracy = (class_pred == class_label).sum()
            #     class_accuracy = class_accuracy * 100 / (batch_size * 32 ** 2)
            #     avg.append(class_accuracy)
            #     writer.add_scalar(tag='class_{} accuracy'.format(j), scalar_value=class_accuracy, global_step=step)
            # classes_avg_acc = np.asarray(avg).mean()
            # writer.add_scalar(tag='classes avg. accuracy', scalar_value=classes_avg_acc, global_step=step)

            #################################
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()
        writer.add_scalar(tag='eval accuracy', scalar_value=mean_accuracy, global_step=step)
        writer.add_scalar(tag='eval loss', scalar_value=mean_loss, global_step=step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    else:
        # model, images, labels, pre_model, save_dir, sum_dir, batch_size, lr, log_after, cuda
        pre_model = kwargs['pre_model']
        images = kwargs['images']
        labels = kwargs['labels']
        batch_size = kwargs['batch_size']
        criterion = nn.CrossEntropyLoss()
        cm = CM(k=7, normalized=True)
        preds, labs = torch.Tensor().long(), torch.Tensor().long()

        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(images_path=images, labels_path=labels, batch_size=batch_size)
        net_accuracy, net_loss = [], []
        num_classes = 7
        net_class_accuracy = [[] for _ in range(num_classes)]
        for idx, data in enumerate(test_loader):
            if idx == 400:
                break
            test_x, label = data['input'], data['label']
            # print(test_x.shape)
            if cuda:
                test_x = test_x.cuda()
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)

            # get accuracy metric
            accuracy = (pred == label).sum()
            accuracy = accuracy * 100 / (pred.view(-1).size(0))
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())
            if idx % 10 == 0:
                print('log: on {}'.format(idx))

            # print(pred.view(-1).size(0))
            # get per-class metrics with the same computation for every class
            # (note: `class_pred == class_label` also counts positions where both
            # are False, so these per-class figures include true negatives)
            for j in range(num_classes):
                class_pred = (pred == j)
                class_label = (label == j)
                class_accuracy = (class_pred == class_label).sum()
                class_accuracy = class_accuracy * 100 / (pred.view(-1).size(0))
                net_class_accuracy[j].append(class_accuracy)

            preds = torch.cat((preds, pred.long().view(-1)))
            labs = torch.cat((labs, label.long().view(-1)))
            #################################
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()

        class_mean_accuracy = [np.asarray(acc).mean() for acc in net_class_accuracy]

        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        for j in range(num_classes):
            print('log: class {}:: total accuracy = {:.5f}%'.format(j, class_mean_accuracy[j]))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

        # class_names = ['background/clutter', 'buildings', 'trees', 'cars',
        #                'low_vegetation', 'impervious_surfaces', 'noise']
        # cm_preds = pred.view(-1).cpu().numpy()
        # cm_labels = label.view(-1).cpu().numpy()
        # cnf_matrix = confusion_matrix(cm_labels, cm_preds)
        #
        # fig1 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names,
        #                       title='Confusion matrix, without normalization')
        # # Plot normalized confusion matrix
        # fig2 = plt.figure()
        # plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
        #                       title='Normalized confusion matrix')
        # fig2img(fig1).save('unnormalized.png')
        # fig2img(fig2).save('normalized.png')

        #
        # cm.add(preds.view(-1), labs.view(-1).type(torch.LongTensor))
        # this = cm.value()
        # print(this)
        # df_cm = pd.DataFrame(this, index=[f for f in class_names],
        #                      columns=[f for f in class_names])
        # fig = plt.figure(figsize=(10, 7))
        # sn.heatmap(df_cm, annot=True)
        # fig2img(fig).save('sea.png')
    pass
Example #14
	def build_datasets(self):
		args = self.args
		if args.watershed:
			self.train_loader, self.val_loader = dataset.get_dataloaders(args.batch_size, augment=True, skip_no_lenses_frames=False, watershed_endpoints=WATERSHED_ENDPOINTS)
		else:
			self.train_loader, self.val_loader = dataset.get_classifier_dataloaders(args.batch_size, augment=True)
Example #15
def train_net(model, images, labels, pre_model, save_dir, sum_dir,
              batch_size, lr, log_after, cuda, device):
    print(model)
    if cuda:
        print('GPU')
        model.cuda(device=device)
    # define loss and optimizer
    optimizer = RMSprop(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(images_path=images,
                                                                labels_path=labels,
                                                                batch_size=batch_size)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(sum_dir):
        os.mkdir(sum_dir)
    writer = SummaryWriter()
    if True:
        i = 0
        m_loss, m_accuracy = [], []
        num_classes = 7
        if pre_model:
            # self.load_state_dict(torch.load(pre_model)['model'])
            model.load_state_dict(torch.load(pre_model))
            print('log: resumed model {} successfully!'.format(pre_model))
            model_number = int(pre_model.split('/')[1].split('-')[1].split('.')[0])
        else:
            print('log: starting anew...')
        while True:
            i += 1
            net_loss = []
            net_accuracy = []
            if not pre_model:
                save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
            else:
                save_path = os.path.join(save_dir, 'model-{}.pt'.format(i+model_number-1))
            if i > 1 and not os.path.exists(save_path):
                torch.save(model.state_dict(), save_path)
                # remember to save only five previous models, so
                del_this = os.path.join(save_dir, 'model-{}.pt'.format(i+model_number-6))
                if os.path.exists(del_this):
                    os.remove(del_this)
                    print('log: removed {}'.format(del_this))
                print('log: saved {}'.format(save_path))
            list_of_pred = []
            list_of_labels = []
            for idx, data in enumerate(train_loader):
                ##########################
                model.train()
                ##########################
                test_x, label = data['input'], data['label']
                image0 = test_x[0]
                test_x = test_x.cuda(device=device) if cuda else test_x
                size = test_x.size(-1)
                # forward
                out_x, pred = model.forward(test_x)
                pred = pred.cpu(); out_x = out_x.cpu()
                image1 = pred[0]
                image2 = label[0]
                if idx % (len(train_loader) // 2) == 0:
                    writer.add_image('input', image0, i)
                    writer.add_image('pred', image1, i)
                    writer.add_image('label', image2, i)

                loss = criterion(out_x, label)
                # get accuracy metric
                accuracy = (pred == label).sum()
                # also convert into np arrays to be used for confusion matrix
                pred_np = pred.numpy(); list_of_pred.append(pred_np)
                label_np = label.numpy(); list_of_labels.append(label_np)

                writer.add_scalar(tag='loss', scalar_value=loss.item(), global_step=i)
                writer.add_scalar(tag='over_all accuracy',
                                  scalar_value=accuracy*100/(test_x.size(0)*size**2),
                                  global_step=i)

                # per class accuracies
                avg = []
                for j in range(num_classes):
                    class_pred = (pred == j)
                    class_label = (label == j)
                    class_accuracy = (class_pred == class_label).sum()
                    class_accuracy = class_accuracy * 100 / (test_x.size(0) * size ** 2)
                    avg.append(class_accuracy)
                    writer.add_scalar(tag='class_{} accuracy'.format(j), scalar_value=class_accuracy, global_step=i)
                classes_avg_acc = np.asarray(avg).mean()
                writer.add_scalar(tag='classes avg. accuracy', scalar_value=classes_avg_acc, global_step=i)

                if idx % log_after == 0 and idx > 0:
                    print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(i,
                                                                                            idx,
                                                                                            len(train_loader),
                                                                                            out_x.size(),
                                                                                            loss.item(),
                                                                                            accuracy,
                                                                                            test_x.size(0)*size**2))
                #################################
                # three steps for backprop
                model.zero_grad()
                loss.backward()
                # perform gradient clipping between loss backward and optimizer step
                clip_grad_norm_(model.parameters(), 0.05)
                optimizer.step()
                accuracy = accuracy * 100 / (test_x.size(0)*size**2)
                net_accuracy.append(accuracy)
                net_loss.append(loss.item())
                #################################
            mean_accuracy = np.asarray(net_accuracy).mean()
            mean_loss = np.asarray(net_loss).mean()
            m_loss.append((i, mean_loss))
            m_accuracy.append((i, mean_accuracy))
            print('####################################')
            print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}%'.format(i, mean_loss, mean_accuracy))
            print('####################################')

            # # one epoch complete, get new confusion matrix!
            # cm_preds = np.vstack(list_of_pred)
            # cm_preds = cm_preds.reshape(-1)
            # cm_labels = np.vstack(list_of_labels)
            # cm_labels = cm_labels.reshape(-1)
            # cnf_matrix = confusion_matrix(cm_labels, cm_preds)
            # fig1 = plt.figure()
            # plot_confusion_matrix(cnf_matrix, classes=class_names,
            #                       title='Confusion matrix, without normalization')
            # # Plot normalized confusion matrix
            # fig2 = plt.figure()
            # plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
            #                       title='Normalized confusion matrix')
            # get1 = np.asarray(fig2img(fig1))
            # get2 = np.asarray(fig2img(fig2))
            # # print(get1.size)


            # validate model
            if i % 10 == 0:
                eval_net(model=model, criterion=criterion, val_loader=val_dataloader,
                         denominator=batch_size * size**2, cuda=cuda, device=device,
                         writer=writer, num_classes=num_classes, batch_size=batch_size, step=i)
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()

    pass
Example #16
File: predict.py Project: xianyuzzp/MURA
                        default='results',
                        type=str,
                        help='parent directory to write result')
    parser.add_argument('--phase',
                        default='test',
                        type=str,
                        choices=['valid', 'test'])
    parser.add_argument('-b',
                        '--batch_size',
                        default=16,
                        type=int,
                        help='mini-batch size')
    args = parser.parse_args()

    dataloader = get_dataloaders(args.phase,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 data_dir=args.data_dir)
    total_scores = None  # voting scores
    labels = None
    st_corrects = {st: 0 for st in config.study_type}
    nr_stype = {st: 0 for st in config.study_type}

    for j in range(len(model_list)):
        print('single model ' + str(j), model_list[j])

        if 'fuse' in model_list[j]:
            state_dict = torch.load(model_list[j])['state_dict']
            net = fusenet()
            net.load_state_dict(state_dict)
            net.set_fcweights()
            net = torch.nn.DataParallel(net).cuda()
示例#17
0
def eval_net(**kwargs):
    cuda = kwargs['cuda']
    device = kwargs['device']
    model = kwargs['model']
    model.eval()

    if cuda:
        model.cuda(device=device)
    if 'writer' in kwargs.keys():
        num_classes = kwargs['num_classes']
        batch_size = kwargs['batch_size']
        writer = kwargs['writer']
        step = kwargs['step']
        denominator = kwargs['denominator']
        val_loader = kwargs['val_loader']
        model = kwargs['model']
        criterion = kwargs['criterion']
        net_accuracy, net_loss = [], []
        for idx, data in enumerate(val_loader):
            test_x, label = data['input'], data['label']
            test_x = test_x.cuda() if cuda else test_x
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)
            # get accuracy metric
            accuracy = (pred == label).sum()
            accuracy = accuracy * 100 / (test_x.size(0)*64**2)
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())

            # per class accuracies
            # avg = []
            # for j in range(num_classes):
            #     class_pred = (pred == j)
            #     class_label = (label == j)
            #     class_accuracy = (class_pred == class_label).sum()
            #     class_accuracy = class_accuracy * 100 / (batch_size * 32 ** 2)
            #     avg.append(class_accuracy)
            #     writer.add_scalar(tag='class_{} accuracy'.format(j), scalar_value=class_accuracy, global_step=step)
            # classes_avg_acc = np.asarray(avg).mean()
            # writer.add_scalar(tag='classes avg. accuracy', scalar_value=classes_avg_acc, global_step=step)

            #################################
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()
        writer.add_scalar(tag='eval accuracy', scalar_value=mean_accuracy, global_step=step)
        writer.add_scalar(tag='eval loss', scalar_value=mean_loss, global_step=step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: validation:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')

    else:
        # model, images, labels, pre_model, save_dir, sum_dir, batch_size, lr, log_after, cuda
        pre_model = kwargs['pre_model']
        images = kwargs['images']
        labels = kwargs['labels']
        batch_size = kwargs['batch_size']
        criterion = nn.CrossEntropyLoss()
        cm = CM(k=7, normalized=True)
        preds, labs = torch.Tensor().long(), torch.Tensor().long()

        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(images_path=images, labels_path=labels, batch_size=batch_size)
        net_accuracy, net_loss = [], []
        net_class_accuracy = [[] for _ in range(7)]  # one score list per class
        for idx, data in enumerate(test_loader):
            if idx == 400:
                break
            test_x, label = data['input'], data['label']
            # print(test_x.shape)
            if cuda:
                test_x = test_x.cuda()
            # forward
            out_x, pred = model.forward(test_x)
            pred = pred.cpu()
            loss = criterion(out_x.cpu(), label)

            # get accuracy metric
            accuracy = (pred == label).sum().item()
            accuracy = accuracy * 100.0 / pred.view(-1).size(0)
            net_accuracy.append(accuracy)
            net_loss.append(loss.item())
            if idx % 10 == 0:
                print('log: on {}'.format(idx))

            # per-class agreement, matching the original unrolled blocks for
            # classes 0..6: count positions where the class-k prediction mask
            # equals the class-k label mask (true negatives included)
            for k in range(7):
                agree = ((pred == k) == (label == k)).sum().item()
                net_class_accuracy[k].append(agree * 100.0 / pred.view(-1).size(0))

            preds = torch.cat((preds, pred.long().view(-1)))
            labs = torch.cat((labs, label.long().view(-1)))
            #################################
        mean_accuracy = np.asarray(net_accuracy).mean()
        mean_loss = np.asarray(net_loss).mean()

        class_mean_accuracy = [np.asarray(scores).mean()
                               for scores in net_class_accuracy]

        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(mean_loss, mean_accuracy))
        for k, acc in enumerate(class_mean_accuracy):
            print('log: class {}:: total accuracy = {:.5f}%'.format(k, acc))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
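
The per-class figures above count agreement between binary masks, which also rewards true negatives. When genuine per-class recall is wanted, a confusion matrix is the usual route; a small sketch, not taken from the project:

import torch

def confusion_matrix(pred, label, num_classes=7):
    # row r, column c counts samples of true class r predicted as class c
    idx = label.view(-1) * num_classes + pred.view(-1)
    return torch.bincount(idx, minlength=num_classes ** 2).view(num_classes, num_classes)

# per-class recall = correct predictions of class k / true members of class k:
# cm = confusion_matrix(pred.long(), label.long())
# recall = cm.diag().float() / cm.sum(dim=1).clamp(min=1).float()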
Example #18
0
    parser.add_argument('--img_type',
                        default='ALL',
                        type=str,
                        required=False,
                        choices=[
                            'ELBOW', 'FINGER', 'FOREARM', 'HAND', 'HUMERUS',
                            'SHOULDER', 'WRIST', 'ALL'
                        ],
                        help='type of query input')
    args = parser.parse_args()

    net = torch.load(args.model_path)['net']

    if args.generate:
        dataloader = get_dataloaders('train',
                                     batch_size=args.batch_size,
                                     shuffle=False)

        generate_database(net, dataloader, args.save_dir)

    database = h5py.File(args.database_path, 'r')

    image = Image.open(args.img_path).convert('RGB')

    top5 = retrieval(image, net, database, args.img_type)
    print("The most similar five are:")
    i = 0
    for path in top5:
        i += 1
        print(path.item())
        path = os.path.join(args.data_dir, str(path.item())[2:-1])
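
retrieval() above is project code; a common way to implement such a top-5 lookup is cosine similarity against the precomputed feature matrix. A hedged sketch in which feats is an (N, D) array and paths holds the matching image paths (the real h5py layout may differ):

import numpy as np

def top5_by_cosine(query_feat, feats, paths):
    feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)
    q = query_feat / np.linalg.norm(query_feat)
    best = np.argsort(-(feats @ q))[:5]          # five highest similarities
    return [paths[i] for i in best]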
Example #19
0
        target = torch.masked_select(target, target_mask)
        loss = nn.BCEWithLogitsLoss()(output.float(), target.float())
        return {"loss": loss, "output": output, "target": target}

    def validation_step(self, batch, batch_idx):
        inputs, target_ids, target = batch
        output = self(inputs["input_ids"], inputs["input_cat"], target_ids,
                      inputs["input_rtime"])
        target_mask = (target_ids != 0)
        output = torch.masked_select(output.squeeze(), target_mask)
        target = torch.masked_select(target, target_mask)
        loss = nn.BCEWithLogitsLoss()(output.float(), target.float())
        return {"val_loss": loss, "output": output, "target": target}


train_loader, val_loader = get_dataloaders()

ARGS = {
    "n_dims": config.EMBED_DIMS,
    'n_encoder': config.NUM_ENCODER,
    'n_decoder': config.NUM_DECODER,
    'enc_heads': config.ENC_HEADS,
    'dec_heads': config.DEC_HEADS,
    'total_ex': config.TOTAL_EXE,
    'total_cat': config.TOTAL_CAT,
    'total_responses': config.TOTAL_EXE,
    'seq_len': config.MAX_SEQ
}

########### TRAINING AND SAVING MODEL #######
checkpoint = ModelCheckpoint(filename="{epoch}_model",
Example #20
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=100, type=int, help='epoch number')
    parser.add_argument('--start_epoch',
                        default=0,
                        type=int,
                        help='start epoch number')
    parser.add_argument('-b',
                        '--batch_size',
                        default=8,
                        type=int,
                        help='mini-batch size')
    parser.add_argument('--lr',
                        '--learning_rate',
                        default=1e-4,
                        type=float,
                        help='initial learning rate')
    parser.add_argument('--weight-decay',
                        default=0.0,
                        type=float,
                        help='weight decay')
    parser.add_argument('-c',
                        '--continue',
                        dest='continue_path',
                        type=str,
                        required=False)
    parser.add_argument('--exp_name',
                        default=config.exp_name,
                        type=str,
                        required=False)
    args = parser.parse_args()
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()

    save_args(args, config.log_dir)
    net = network()

    net = torch.nn.DataParallel(net).cuda()
    sess = Session(config, net=net)

    train_loader = get_dataloaders(os.path.join(config.data_dir, 'train.json'),
                                   batch_size=args.batch_size,
                                   shuffle=True)
    valid_loader = get_dataloaders(os.path.join(config.data_dir, 'val.json'),
                                   batch_size=args.batch_size,
                                   shuffle=True)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    clock = sess.clock
    tb_writer = sess.tb_writer

    criterion = nn.L1Loss().cuda()

    optimizer = optim.Adam(sess.net.parameters(),
                           args.lr,
                           weight_decay=args.weight_decay)

    scheduler = ReduceLROnPlateau(optimizer,
                                  'min',
                                  factor=0.1,
                                  patience=10,
                                  verbose=True)

    for e in range(args.epochs):
        train_model(train_loader, sess.net, criterion, optimizer, clock.epoch,
                    tb_writer)
        valid_out = valid_model(valid_loader, sess.net, criterion, clock.epoch,
                                tb_writer)
        tb_writer.add_scalar('train/learning_rate',
                             optimizer.param_groups[-1]['lr'], clock.epoch)
        scheduler.step(valid_out['epoch_loss'])

        if valid_out['epoch_loss'] < sess.best_val_loss:
            sess.best_val_loss = valid_out['epoch_loss']
            sess.save_checkpoint('best_model.pth.tar')

        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')

        clock.tock()
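
# The Session above saves three kinds of checkpoints (best, every 10th
# epoch, latest) and advances an epoch counter via clock.tock(). A minimal
# sketch of such a clock; the project's real TrainClock may differ:
class _TrainClockSketch:
    def __init__(self):
        self.epoch, self.step = 0, 0

    def tick(self):            # call once per minibatch
        self.step += 1

    def tock(self):            # call once per epoch
        self.epoch += 1

    def state_dict(self):      # lets the clock ride along in checkpoints
        return {'epoch': self.epoch, 'step': self.step}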
def train_net(model, base_folder, pre_model, save_dir, batch_size, lr,
              log_after, cuda, device):
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(
        base_folder=base_folder, batch_size=batch_size)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if True:
        i = 1
        m_loss, m_accuracy = [], []
        if pre_model:
            # self.load_state_dict(torch.load(pre_model)['model'])
            model.load_state_dict(torch.load(pre_model))
            print('log: resumed model {} successfully!'.format(pre_model))
            print(model)

            # starting point
            # model_number = int(pre_model.split('/')[1].split('-')[1].split('.')[0])
            model_number = int(re.findall(r'\d+', str(pre_model))[0])
            i = i + model_number - 1
        else:
            print('log: starting anew using ImageNet weights...')

        while True:
            i += 1
            net_loss = []
            # new model path
            save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
            # keep only the five most recent checkpoints
            del_this = os.path.join(save_dir, 'model-{}.pt'.format(i - 6))
            if os.path.exists(del_this):
                os.remove(del_this)
                print('log: removed {}'.format(del_this))

            if i > 1 and not os.path.exists(save_path):
                torch.save(model.state_dict(), save_path)
                print('log: saved {}'.format(save_path))

            correct_count, total_count = 0, 0
            for idx, data in enumerate(train_loader):
                ##########################
                model.train()  # train mode at each epoch, just in case...
                ##########################
                test_x, label = data['input'], data['label']
                if cuda:
                    test_x = test_x.cuda(device=device)
                    label = label.cuda(device=device)
                # forward
                out_x, pred = model.forward(test_x)
                # out_x, pred = out_x.cpu(), pred.cpu()
                loss = criterion(out_x, label)
                net_loss.append(loss.item())

                # get accuracy metric
                batch_correct = (label.eq(pred.long())).double().sum().item()
                correct_count += batch_correct
                # print(batch_correct)
                total_count += float(pred.size(0))  # np.float was removed from NumPy
                if idx % log_after == 0 and idx > 0:
                    print(
                        '{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'
                        .format(i, idx, len(train_loader), out_x.size(),
                                loss.item(), batch_correct, pred.size(0)))
                #################################
                # three steps for backprop
                model.zero_grad()
                loss.backward()
                # perform gradient clipping between loss backward and optimizer step
                clip_grad_norm_(model.parameters(), 0.05)
                optimizer.step()
                #################################
            mean_accuracy = correct_count / total_count * 100
            mean_loss = np.asarray(net_loss).mean()
            m_loss.append((i, mean_loss))
            m_accuracy.append((i, mean_accuracy))

            writer.add_scalar(tag='train loss',
                              scalar_value=mean_loss,
                              global_step=i)
            writer.add_scalar(tag='train over_all accuracy',
                              scalar_value=mean_accuracy,
                              global_step=i)

            print('####################################')
            print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}%'.
                  format(i, mean_loss, mean_accuracy))
            print('####################################')

            # validate model after each epoch
            eval_net(model=model,
                     writer=writer,
                     criterion=criterion,
                     val_loader=val_dataloader,
                     denominator=batch_size,
                     cuda=cuda,
                     device=device,
                     global_step=i)
    pass
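
The while-loop above implements a rolling checkpoint policy: save model-{i}.pt, then delete the file six epochs back so only the most recent few survive. The same idea as a small helper (a sketch, not project code):

import os

def prune_checkpoints(save_dir, current_epoch, keep=5):
    stale = os.path.join(save_dir, 'model-{}.pt'.format(current_epoch - keep - 1))
    if os.path.exists(stale):
        os.remove(stale)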
Example #22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')

    parser.add_argument('-dd',
                        '--data-dir',
                        type=str,
                        default='data',
                        help='Data directory')

    parser.add_argument('-l',
                        '--loss',
                        type=str,
                        default='label_smooth_cross_entropy')
    parser.add_argument('-t1', '--temper1', type=float, default=0.2)
    parser.add_argument('-t2', '--temper2', type=float, default=4.0)
    parser.add_argument('-optim', '--optimizer', type=str, default='adam')

    parser.add_argument('-prep', '--prep_function', type=str, default='none')

    parser.add_argument('--train_on_different_datasets', action='store_true')
    parser.add_argument('--use-current', action='store_true')
    parser.add_argument('--use-extra', action='store_true')
    parser.add_argument('--use-unlabeled', action='store_true')

    parser.add_argument('--fast', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    parser.add_argument('--balance', action='store_true')
    parser.add_argument('--balance-datasets', action='store_true')

    parser.add_argument('--show', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')

    parser.add_argument('-m',
                        '--model',
                        type=str,
                        default='efficientnet-b4',
                        help='')
    parser.add_argument('-b',
                        '--batch-size',
                        type=int,
                        default=8,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-e',
                        '--epochs',
                        type=int,
                        default=100,
                        help='Epoch to run')
    parser.add_argument('-s',
                        '--sizes',
                        default=380,
                        type=int,
                        help='Image size for training & inference')
    parser.add_argument('-f', '--fold', type=int, default=None)
    parser.add_argument('-t', '--transfer', default=None, type=str, help='')
    parser.add_argument('-lr',
                        '--learning_rate',
                        type=float,
                        default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-a',
                        '--augmentations',
                        default='medium',
                        type=str,
                        help='')
    parser.add_argument('-accum', '--accum-step', type=int, default=1)
    parser.add_argument('-metric', '--metric', type=str, default='accuracy01')

    args = parser.parse_args()

    diff_dataset_train = args.train_on_different_datasets

    data_dir = args.data_dir
    epochs = args.epochs
    batch_size = args.batch_size
    seed = args.seed

    loss_name = args.loss
    optim_name = args.optimizer

    prep_function = args.prep_function

    model_name = args.model
    size = args.sizes
    image_size = (size, size)
    fast = args.fast
    fold = args.fold
    mixup = args.mixup
    balance = args.balance
    balance_datasets = args.balance_datasets
    show_batches = args.show
    verbose = args.verbose
    use_current = args.use_current
    use_extra = args.use_extra
    use_unlabeled = args.use_unlabeled

    learning_rate = args.learning_rate
    augmentations = args.augmentations
    transfer = args.transfer
    accum_step = args.accum_step

    # main metric: 'cosine_loss' or 'accuracy01'
    main_metric = args.metric

    print(data_dir)

    num_classes = 5

    assert use_current or use_extra

    print(fold)

    current_time = datetime.now().strftime('%b%d_%H_%M')
    random_name = get_random_name()

    # if folds is None or len(folds) == 0:
    #     folds = [None]

    torch.cuda.empty_cache()
    checkpoint_prefix = f'{model_name}_{size}_{augmentations}'

    if transfer is not None:
        checkpoint_prefix += '_pretrain_from_' + str(transfer)
    else:
        if use_current:
            checkpoint_prefix += '_current'
        if use_extra:
            checkpoint_prefix += '_extra'
        if use_unlabeled:
            checkpoint_prefix += '_unlabeled'
        if fold is not None:
            checkpoint_prefix += f'_fold{fold}'

    directory_prefix = f'{current_time}_{checkpoint_prefix}'
    log_dir = os.path.join('runs', directory_prefix)
    os.makedirs(log_dir, exist_ok=False)

    set_manual_seed(seed)
    model = get_model(model_name)

    if transfer is not None:
        print("Transfering weights from model checkpoint")
        model.load_state_dict(torch.load(transfer)['model_state_dict'])

    model = model.cuda()

    if diff_dataset_train:
        train_on = ['current_train', 'extra_train']
        valid_on = ['unlabeled']
        train_ds, valid_ds, train_sizes = get_datasets_universal(
            train_on=train_on,
            valid_on=valid_on,
            image_size=image_size,
            augmentation=augmentations,
            target_dtype=int,
            prep_function=prep_function)
    else:
        train_ds, valid_ds, train_sizes = get_datasets(
            data_dir=data_dir,
            use_current=use_current,
            use_extra=use_extra,
            image_size=image_size,
            prep_function=prep_function,
            augmentation=augmentations,
            target_dtype=int,
            fold=fold,
            folds=5)

    # pass the parsed --balance / --balance-datasets flags through instead of
    # hard-coding True, which left the CLI switches without effect
    train_loader, valid_loader = get_dataloaders(train_ds,
                                                 valid_ds,
                                                 batch_size=batch_size,
                                                 train_sizes=train_sizes,
                                                 num_workers=6,
                                                 balance=balance,
                                                 balance_datasets=balance_datasets,
                                                 balance_unlabeled=False)

    loaders = collections.OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    runner = SupervisedRunner(input_key='image')

    criterions = get_loss(loss_name)
    # criterions_tempered = TemperedLogLoss()
    # optimizer = catalyst.contrib.nn.optimizers.radam.RAdam(model.parameters(), lr = learning_rate)
    optimizer = get_optim(optim_name, model, learning_rate)
    # optimizer = catalyst.contrib.nn.optimizers.Adam(model.parameters(), lr = learning_rate)
    # criterions = nn.CrossEntropyLoss()
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[25], gamma=0.8)
    # cappa = CappaScoreCallback()

    Q = math.floor(len(train_ds) / batch_size)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Q)
    if main_metric != 'accuracy01':
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            CosineLossCallback(),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]
    else:
        callbacks = [
            AccuracyCallback(num_classes=num_classes),
            OptimizerCallback(accumulation_steps=accum_step),
            CheckpointCallback(save_n_best=epochs)
        ]

    # main_metric = 'accuracy01'

    runner.train(
        fp16=True,
        model=model,
        criterion=criterions,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=callbacks,
        loaders=loaders,
        logdir=log_dir,
        num_epochs=epochs,
        verbose=verbose,
        main_metric=main_metric,
        minimize_metric=False,
    )
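
Q above sets T_max of CosineAnnealingLR to the number of optimizer steps in one epoch, so the learning rate sweeps half a cosine period per epoch provided the runner steps the scheduler per batch (an assumption about the Catalyst setup). The same schedule in plain PyTorch:

import torch

model = torch.nn.Linear(10, 5)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
steps_per_epoch = 100   # stands in for math.floor(len(train_ds) / batch_size)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps_per_epoch)

for step in range(steps_per_epoch):
    optimizer.step()    # real training step elided
    scheduler.step()    # anneal once per batch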
Example #23
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('--epochs',
                        default=200,
                        type=int,
                        help='epoch number(Default:200)')
    parser.add_argument('--start_epoch',
                        default=0,
                        type=int,
                        help='start epoch number')
    parser.add_argument('-b',
                        '--batch_size',
                        default=6,
                        type=int,
                        help='mini-batch size(Default:6)')
    parser.add_argument('--lr',
                        '--learning_rate',
                        default=1e-3,
                        type=float,
                        help='initial learning rate(Default:1e-3)')
    parser.add_argument('--resume',
                        type=str,
                        default=None,
                        help='The path for checkpoint file')
    parser.add_argument('--exp',
                        type=str,
                        default='test',
                        help='The name of this exp')

    parser.add_argument('--content',
                        type=float,
                        default=10.0,
                        help='the weight of content loss(Default:10.0)')
    parser.add_argument('--tv',
                        type=float,
                        default=3e-3,
                        help='the weight of TV loss(Default:0.003)')
    parser.add_argument('--adv',
                        type=float,
                        default=3.00,
                        help='the weight of adv loss(Default:3.0)')

    parser.add_argument('--first_stage',
                        type=str,
                        default='./exps/2_baseline/checkpoint.pth.tar',
                        help='first stage model')

    args = parser.parse_args()
    base_dir = './twostageExps/'
    exp_dir = os.path.join(base_dir, args.exp)
    base_results_dir = os.path.join(exp_dir, 'results/')
    best_metric = 0
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)
    if not os.path.exists(base_results_dir):
        os.mkdir(base_results_dir)

    save_args(args, exp_dir)

    global AdLossWeight
    global TvLossWeight
    global ContentLossWeight
    AdLossWeight = args.adv
    TvLossWeight = args.tv
    ContentLossWeight = args.content
    log_dir = os.path.join('./twostageLogs', args.exp)

    writer = SummaryWriter(log_dir)

    first_stage = network()
    generator = network()
    critic = critic_network()
    optimizer_G = optim.Adam(generator.parameters(), args.lr)
    optimizer_C = optim.Adam(critic.parameters(), args.lr)
    # each scheduler must wrap its own optimizer; the original swapped them,
    # so the critic loss was driving the generator's learning rate
    scheduler_C = ReduceLROnPlateau(optimizer_C,
                                    'min',
                                    factor=0.2,
                                    patience=10,
                                    verbose=True)
    scheduler_G = ReduceLROnPlateau(optimizer_G,
                                    'min',
                                    factor=0.2,
                                    patience=10,
                                    verbose=True)

    if args.resume is not None:
        assert os.path.exists(args.resume), 'model does not exist!'
        print('=> loading checkpoint {}'.format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        generator.load_state_dict(checkpoint['generator'])
        critic.load_state_dict(checkpoint['critic'])
        #best_metric = checkpoint['best_metric']
        optimizer_G = checkpoint['optimizer_G']
        optimizer_C = checkpoint['optimizer_C']

        print('=> loaded checkpoint {} - epoch:{} - best_metric:{}'.format(
            args.resume, args.start_epoch, best_metric))
    else:
        print('No checkpoint. A new beginning')

    if args.first_stage is not None:
        assert os.path.exists(
            args.first_stage), 'first stage model does not exist!'
        print('=> loading first stage model {}'.format(args.first_stage))
        first_stage_checkpoint = torch.load(args.first_stage)
        first_stage.load_state_dict(first_stage_checkpoint['generator'])

        for p in first_stage.parameters():
            p.requires_grad = False

    vgg_for_perceptual_loss = vgg19()
    for p in vgg_for_perceptual_loss.parameters():
        p.requires_grad = False

    generator.cuda()
    critic.cuda()
    vgg_for_perceptual_loss.cuda()
    vgg_for_perceptual_loss.eval()
    first_stage.cuda()
    first_stage.eval()
    clock = TrainClock()
    clock.epoch = args.start_epoch
    data_dir = config.data_dir
    train_loader = get_dataloaders(os.path.join(data_dir, 'train.json'),
                                   batch_size=args.batch_size,
                                   shuffle=True)
    valid_loader = get_dataloaders(os.path.join(config.data_dir, 'val.json'),
                                   batch_size=args.batch_size,
                                   shuffle=True)
    print('Begin training')

    for epoch in range(args.start_epoch, args.epochs):

        results_dir = os.path.join(base_results_dir, '{}'.format(epoch))
        if not os.path.exists(results_dir):
            os.mkdir(results_dir)

        train(first_stage, generator, critic, optimizer_G, optimizer_C,
              train_loader, vgg_for_perceptual_loss, clock, writer, 2)

        save_checkpoint(
            {
                'epoch': clock.epoch,
                'generator': generator.state_dict(),
                'critic': critic.state_dict(),
                'optimizer_G': optimizer_G,
                'optimizer_C': optimizer_C,
            },
            is_best=True,
            prefix=exp_dir)
        torch.cuda.empty_cache()
        test_on_benchmark_two_stage(first_stage, generator, results_dir)
        torch.cuda.empty_cache()
        CriticRealLoss, ContentLoss = evaluate_on_val_two_stage(
            first_stage, generator, critic, valid_loader,
            vgg_for_perceptual_loss, clock, writer,
            os.path.join(exp_dir, 'valresults.txt'))
        scheduler_C.step(CriticRealLoss)
        scheduler_G.step(ContentLoss)
        torch.cuda.empty_cache()
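
Example #23 freezes both the first-stage generator and the VGG network (requires_grad=False plus eval()) so they act as fixed feature extractors. A hedged sketch of a VGG perceptual loss like the one weighted by --content; the torchvision model, the layer cut and the L1 distance are assumptions, since the project ships its own vgg19:

import torch.nn as nn
from torchvision.models import vgg19

features = vgg19(pretrained=True).features[:36].eval()  # deep conv features
for p in features.parameters():
    p.requires_grad = False                              # frozen extractor

def perceptual_loss(fake, real):
    return nn.functional.l1_loss(features(fake), features(real))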
Example #24
0
File: train.py Project: lvxingvir/MURA
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', default=50, type=int, help='epoch number')
    parser.add_argument('-b',
                        '--batch_size',
                        default=256,
                        type=int,
                        help='mini-batch size')
    parser.add_argument('--lr',
                        '--learning_rate',
                        default=1e-3,
                        type=float,
                        help='initial learning rate')
    parser.add_argument('-c',
                        '--continue',
                        dest='continue_path',
                        type=str,
                        required=False)
    parser.add_argument('--exp_name',
                        default=config.exp_name,
                        type=str,
                        required=False)
    parser.add_argument('--drop_rate', default=0, type=float, required=False)
    parser.add_argument('--only_fc',
                        action='store_true',
                        help='only train fc layers')
    parser.add_argument('--net',
                        default='densenet169',
                        type=str,
                        required=False)
    parser.add_argument('--local',
                        action='store_true',
                        help='train local branch')
    args = parser.parse_args()
    # hard-coded overrides of the parsed arguments
    args.batch_size = 32
    args.epochs = 150
    args.net = 'densenet169'
    print(args)

    config.exp_name = args.exp_name
    config.make_dir()
    save_args(args, config.log_dir)

    # get network
    if args.net == 'resnet50':
        net = resnet50(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'resnet101':
        net = resnet101(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'densenet121':
        net = models.densenet121(pretrained=True)
        net.classifier = nn.Sequential(nn.Linear(1024, 1), nn.Sigmoid())
    elif args.net == 'densenet169':
        net = densenet169(pretrained=True, drop_rate=args.drop_rate)
    elif args.net == 'fusenet':
        global_branch = torch.load(GLOBAL_BRANCH_DIR)['net']
        local_branch = torch.load(LOCAL_BRANCH_DIR)['net']
        net = fusenet(global_branch, local_branch)
        del global_branch, local_branch
    else:
        raise NameError

    net = net.cuda()
    sess = Session(config, net=net)

    # get dataloader
    # train_loader = get_dataloaders('train', batch_size=args.batch_size,
    #                                shuffle=True, is_local=args.local)
    #
    # valid_loader = get_dataloaders('valid', batch_size=args.batch_size,
    #                                shuffle=False, is_local=args.local)
    train_loader = get_dataloaders('train',
                                   batch_size=args.batch_size,
                                   num_workers=4,
                                   shuffle=True)

    valid_loader = get_dataloaders('valid',
                                   batch_size=args.batch_size,
                                   shuffle=False)

    if args.continue_path and os.path.exists(args.continue_path):
        sess.load_checkpoint(args.continue_path)

    # start session
    clock = sess.clock
    tb_writer = sess.tb_writer
    sess.save_checkpoint('start.pth.tar')

    # set criterion, optimizer and scheduler
    criterion = nn.BCELoss().cuda()  # not used

    if args.only_fc:
        optimizer = optim.Adam(sess.net.module.classifier.parameters(),
                               args.lr)
    else:
        optimizer = optim.Adam(sess.net.parameters(), args.lr)

    scheduler = ReduceLROnPlateau(optimizer,
                                  'max',
                                  factor=0.1,
                                  patience=10,
                                  verbose=True)

    # start training
    for e in range(args.epochs):
        train_out = train_model(train_loader, sess.net, criterion, optimizer,
                                clock.epoch)
        valid_out = valid_model(valid_loader, sess.net, criterion, optimizer,
                                clock.epoch)

        tb_writer.add_scalars('loss', {
            'train': train_out['epoch_loss'],
            'valid': valid_out['epoch_loss']
        }, clock.epoch)

        tb_writer.add_scalars('acc', {
            'train': train_out['epoch_acc'],
            'valid': valid_out['epoch_acc']
        }, clock.epoch)

        tb_writer.add_scalar('auc', valid_out['epoch_auc'], clock.epoch)

        tb_writer.add_scalar('learning_rate', optimizer.param_groups[-1]['lr'],
                             clock.epoch)
        scheduler.step(valid_out['epoch_auc'])

        if valid_out['epoch_auc'] > sess.best_val_acc:
            sess.best_val_acc = valid_out['epoch_auc']
            sess.save_checkpoint('best_model.pth.tar')

        if clock.epoch % 10 == 0:
            sess.save_checkpoint('epoch{}.pth.tar'.format(clock.epoch))
        sess.save_checkpoint('latest.pth.tar')

        clock.tock()
def eval_net(**kwargs):
    model = kwargs['model']
    cuda = kwargs['cuda']
    device = kwargs['device']
    if cuda:
        model.cuda(device=device)
    if 'criterion' in kwargs:
        writer = kwargs['writer']
        val_loader = kwargs['val_loader']
        criterion = kwargs['criterion']
        global_step = kwargs['global_step']
        net_loss = []
        model.eval()  # put in eval mode first ############################
        for idx, data in enumerate(val_loader, 1):
            test_x, label = data['input'].unsqueeze(2), data['label']
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, h_n = model.continuous_forward(test_x, out_seq_len=250000)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())
        #################################
        # the loop body above was dedented in the original, so only the last
        # batch was ever scored; idx now counts all validation batches
        mean_loss = np.asarray(net_loss).sum() / idx
        # summarize mean accuracy
        writer.add_scalar(tag='val_loss',
                          scalar_value=mean_loss,
                          global_step=global_step)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print('in_shape = {}, out_shape = {}'.format(test_x.shape,
                                                     out_x.shape))
        print('log: validation:: total loss = {:.5f}'.format(mean_loss))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        if mean_loss:
            ref = test_x[0, :].squeeze(1)  #.numpy()
            this = label[0, :]  #.numpy()
            that = out_x[0, :]  #.numpy()
            this = np.hstack(
                (ref, this)).astype(float)  #/100.0 # rescale to best fit
            that = np.hstack((ref, that)).astype(float)  #/100.0
            fig = pl.figure()
            pl.plot(this, label='series_in')
            pl.plot(that, label='series_out')
            out = pl.legend(loc='lower right')
            pl.savefig('temp.png')
            evaluated_image = cv2.imread('temp.png')
            # os.remove('eval.png')
            # put it into the summary writer
            evaluated_image = torch.Tensor(evaluated_image.transpose(2, 0, 1))
            writer.add_image('evaluation', evaluated_image, global_step)
            # pl.show()
    else:
        # model, images, labels, pre_model, save_dir, sum_dir, batch_size, lr, log_after, cuda
        pre_model = kwargs['pre_model']
        base_folder = kwargs['base_folder']
        batch_size = kwargs['batch_size']
        log_after = kwargs['log_after']
        criterion = nn.CrossEntropyLoss()
        un_confusion_meter = tnt.meter.ConfusionMeter(10, normalized=False)
        confusion_meter = tnt.meter.ConfusionMeter(10, normalized=True)
        model.load_state_dict(torch.load(pre_model))
        print('log: resumed model {} successfully!'.format(pre_model))
        _, _, test_loader = get_dataloaders(base_folder=base_folder,
                                            batch_size=batch_size)
        net_accuracy, net_loss = [], []
        correct_count = 0
        total_count = 0
        for idx, data in enumerate(test_loader):
            model.eval()  # put in eval mode first
            test_x, label = data['input'], data['label']
            # print(test_x)
            # print(test_x.shape)
            # this = test_x.numpy().squeeze(0).transpose(1,2,0)
            # print(this.shape, np.min(this), np.max(this))
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model.forward(test_x)
            loss = criterion(out_x, label)
            un_confusion_meter.add(predicted=pred, target=label)
            confusion_meter.add(predicted=pred, target=label)

            ###############################
            # pred = pred.view(-1)
            # pred = pred.cpu().numpy()
            # label = label.cpu().numpy()
            # print(pred.shape, label.shape)

            ###############################
            # get accuracy metric
            # correct_count += np.sum((pred == label))
            # print(pred, label)
            batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            # print(batch_correct)
            total_count += float(batch_size)
            net_loss.append(loss.item())
            if idx % log_after == 0:
                print('log: on {}'.format(idx))

            #################################
        mean_loss = np.asarray(net_loss).sum()
        mean_accuracy = correct_count * 100 / total_count
        print(correct_count, total_count)
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        print(
            'log: test:: total loss = {:.5f}, total accuracy = {:.5f}%'.format(
                mean_loss, mean_accuracy))
        print('$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$')
        with open('normalized.pkl', 'wb') as this:
            pkl.dump(confusion_meter.value(),
                     this,
                     protocol=pkl.HIGHEST_PROTOCOL)

        with open('un_normalized.pkl', 'wb') as this:
            pkl.dump(un_confusion_meter.value(),
                     this,
                     protocol=pkl.HIGHEST_PROTOCOL)

        pass
    pass
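
The two pickle files written above read back as plain NumPy arrays; tnt.meter.ConfusionMeter.value() returns a (10, 10) matrix whose rows are true classes, row-normalized when normalized=True:

import pickle as pkl
import numpy as np

with open('normalized.pkl', 'rb') as f:
    cm = pkl.load(f)
print(np.round(cm, 3))  # rows: true class, columns: predicted class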
Example #26
0
def tune_hyper(epoch=10, deeplab=False):
    """Tune hyper-parameters for alternative values."""
    
    # All combinations of values here will be tried in a loop
    learning_rate = np.logspace(-3.4, -4.2, 5)
    try_depth = [34]
    random_vflip = [True]
    random_hflip = [True]
    random_rotate = [True]
    random_transform = [True]
    customs = [True, False]
    
    # Model result info
    model_paths = []
    model_best_f1 = []  
    model_vflip = []
    model_hflip = []
    model_transform = []
    model_rotate = []
    model_depth = []
    model_lr = []
    model_custom = []
    
    # Tuning loop
    for md in try_depth:
        for rv in random_vflip:
            for rh in random_hflip:
                for rr in random_rotate:
                    for rt in random_transform:
                        for lr in learning_rate:
                            for custom in customs:
                                # Getting dataset loaders
                                dataloaders = get_dataloaders(
                                    random_transform=rt, 
                                    random_rotate=rr,
                                    random_hflip=rh, 
                                    random_vflip=rv,
                                    all_in=not custom
                                )
                                
                                print(f'Testing for learning rate {lr}')
                                
                                if deeplab:
                                    model = deeplabv3_resnet101(num_classes=1, pretrained=False)
                                else:
                                    model = UNet(n_channels=3, n_classes=1, depth=md)

                                model, epoch_stats, path, best_f1 = train_model(
                                    model, dataloaders, 
                                    num_epochs=epoch, learning_rate=lr, 
                                    deeplab=deeplab
                                )

                                model_paths.append(path)
                                model_best_f1.append(best_f1)
                                model_vflip.append(rv)
                                model_hflip.append(rh)
                                model_transform.append(rt)
                                model_rotate.append(rr)
                                model_depth.append(md)
                                model_lr.append(lr)
                                model_custom.append(custom)

                                print()

    # Collecting in dictionary for a DataFrame
    tune_results = {
        'learning_rate': model_lr,
        'model_depth': model_depth,
        'model_paths': model_paths,
        'random_vflip': model_vflip,
        'random_hflip': model_hflip,
        'random_rotate': model_rotate,
        'random_transform': model_transform,
        'custom': model_custom, 
        'best_f1': model_best_f1
    }
    
    # Saving tuning results for later inspection
    tune_name = time.strftime("%Y%m%d-%H%M%S") + '.tune' 
    tune_path = os.path.join(TUNE_ROOT, tune_name)
    pickle.dump(tune_results, open(tune_path, 'wb'))
    
    print(tune_path)
    print('Tuning complete')
    
    return pd.DataFrame(tune_results), tune_path
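
# The seven nested loops in tune_hyper enumerate a full parameter grid;
# itertools.product yields the same combinations with a single loop. A
# sketch with the same value lists (np is already imported by this example):
from itertools import product

_grid = product([34], [True], [True], [True], [True],
                np.logspace(-3.4, -4.2, 5), [True, False])
for _md, _rv, _rh, _rr, _rt, _lr, _custom in _grid:
    pass  # build dataloaders and train exactly as in the loop body above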
def train(args):
    global img, tgt_caption
    SEED_EVERYTHING()
    batch_size = args.batch_size
    epochs = args.epochs
    device = torch.device(args.device)
    train_dataloader, valid_dataloader = get_dataloaders(batch_size)
    
    with open('config.json','r') as f:
        model_config = json.load(f)
    
    model = get_model(**model_config) #Seq2SeqModel(dropout_p=0.25, hidden_size=256,num_layers=1)
    model.to(device)
#    for param in model.encoder.parameters():
#        param.requires_grad = False
    print(model)
    print(model.decoder.embedding.weight.requires_grad)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7, patience=2, verbose=True, min_lr=1e-6, mode='max')
    
    if args.resume_from is not None:
        state = torch.load(args.resume_from)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        scheduler.load_state_dict(state['scheduler_state_dict'])
    
    if args.mixed_precision:
        scaler = torch.cuda.amp.GradScaler()
    
    loss_func = nn.CrossEntropyLoss(ignore_index=args.padding_idx)
    best_bleu = 0
    for epoch_i in range(epochs):         
        loss_meter = AverageMeter()
        bleu_meter = AverageMeter()
        pbar = tqdm(train_dataloader, total=len(train_dataloader))
        model.train() 
        for step, batch in enumerate(pbar):
            img = batch[0].to(device)
            tgt_caption = batch[1].to(device)
            
            optimizer.zero_grad()      
            if args.mixed_precision:
                with torch.cuda.amp.autocast():
                    outputs = model(img, tgt_caption)
                    # note: args.padding_idx is also used as the vocab size here
                    loss = loss_func(outputs.view(-1, args.padding_idx), tgt_caption[1:].view(-1))
                scaler.scale(loss).backward() 
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)      
                scaler.update()
            else:
                outputs = model(img, tgt_caption)
                loss = loss_func(outputs.view(-1, args.padding_idx), tgt_caption[1:].view(-1))
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            
            pred_captions = outputs.argmax(2).cpu().numpy()
            true_captions = batch[1][1:].numpy()
            
            bleu = calc_bleu_score(true_captions, pred_captions)
           
            loss_meter.update(loss.item())
            bleu_meter.update(bleu)
            
            pbar.set_postfix({'loss':loss_meter.avg, 'bleu':bleu_meter.avg})
          
        valid_loss, valid_bleu = evaluate(model, valid_dataloader, device, epoch_i, args.key, loss_func)
        scheduler.step(valid_bleu)    
       
        if valid_bleu > best_bleu:
          print('validation bleu improved from %.4f to %.4f' % (best_bleu, valid_bleu))
          print('saving model...')
          torch.save({'model_state_dict':model.state_dict(),
                      'optimizer_state_dict':optimizer.state_dict(),
                      'scheduler_state_dict':scheduler.state_dict()}, f'saved_models/{args.key}/state.pth')
          
          best_bleu = valid_bleu
    
        print(f'Epoch: {epoch_i+1}/{epochs}, train loss:{loss_meter.avg:.4f}, train bleu:{bleu_meter.avg:.4f}\nvalid loss: {valid_loss:.4f}, valid bleu: {valid_bleu:.4f}')
        torch.cuda.empty_cache()
Example #28
0
def setup_and_run_train(n_channels, n_classes, dir_img, dir_gt, dir_results, load, 
                val_perc, batch_size, epochs, lr, run, optimizer, loss, evaluation, dir_weights):
    
    # Use GPU or not
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Create the model
    net = UNet(n_channels, n_classes).to(device)
    net = torch.nn.DataParallel(net, device_ids=list(
        range(torch.cuda.device_count()))).to(device)

    # Load old weights
    if load:
        net.load_state_dict(torch.load(load))
        print('Model loaded from {}'.format(load))

    # Load the dataset
    if loss != "WCE":
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc, batch_size)
    else:
        train_loader, val_loader = get_dataloaders(dir_img, dir_gt, val_perc, batch_size, isWCE = True, dir_weights = dir_weights)

    # Pretty print of the run
    print('''\n
    Starting training:
        Dataset: {}
        Num Channels: {}
        Groundtruth: {}
        Num Classes: {}
        Folder to save: {}
        Load previous: {}
        Training size: {}
        Validation size: {}
        Validation Percentage: {}
        Batch size: {}
        Epochs: {}
        Learning rate: {}
        Optimizer: {}
        Loss Function: {}
        Evaluation Function: {}
        CUDA: {}
    '''.format(dir_img, n_channels, dir_gt, n_classes, dir_results, load, 
            len(train_loader)*batch_size, len(val_loader)*batch_size, 
            val_perc, batch_size, epochs, lr, optimizer, loss, evaluation, use_cuda))

    # Definition of the optimizer ADD MORE IF YOU WANT
    if optimizer == "Adam":
        optimizer = torch.optim.Adam(net.parameters(),
                             lr=lr)
    elif optimizer == "SGD":
        optimizer = torch.optim.SGD(net.parameters(),
                        lr=lr,
                        momentum=0.9,
                        weight_decay=0.0005)

    # Definition of the loss function ADD MORE IF YOU WANT
    if loss == "Dice":
        criterion = DiceLoss()
    elif loss == "RMSE":
        criterion = RMSELoss()
    elif loss == "MSE":
        criterion = nn.MSELoss()
    elif loss == "MAE":
        criterion = nn.L1Loss()
    elif loss == "CE":
        criterion = CELoss()
    elif loss == "WCE":
        criterion = WCELoss()

    # Saving History to csv
    header = ['epoch', 'train loss']

    best_loss = float('inf')
    time_start = time.time()
    # Run the training and validation
    for epoch in range(epochs):
        print('\nStarting epoch {}/{}.'.format(epoch + 1, epochs))

        train_loss = train_net(net, device, train_loader, optimizer, criterion, batch_size, isWCE = (loss == "WCE"))
        #val_loss = val_net(net, device, val_loader, criterion_val, batch_size)
        
        values = [epoch+1, train_loss]
        export_history(header, values, dir_results, "result"+run+".csv")
        
        # save model
        if train_loss < best_loss:
            best_loss = train_loss
            save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': net.state_dict(),
                    'loss': train_loss,
                    'optimizer' : optimizer.state_dict(),
                }, path=dir_results, filename="weights"+run+".pth")

    time_dif = time.time() - time_start
    print("It took %.4f seconds to finish the run." % time_dif)