Example #1
File: train.py Project: qijiayi-dev/OwlEye
def train():
    datafile = DATA('train', dataset_dir)
    dataloader = DataLoader(datafile,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=workers,
                            drop_last=True)

    print('-------------train-----------------')
    print('Length of train set is {0}'.format(len(datafile)))
    model = Net()
    model = model.cuda()
    model = nn.DataParallel(model)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    cnt = 0
    count = 0

    for epoch in range(nepoch):
        for img, label in dataloader:
            img, label = Variable(img).cuda(), Variable(label).cuda()
            out = model(img)
            loss = criterion(out, label.squeeze())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            cnt += 1
            print('Epoch:{0},Frame:{1}, train_loss {2}'.format(
                epoch, cnt * batch_size, loss / batch_size))

        torch.save(model.state_dict(),
                   '{0}/{1}model.pth'.format(model_cp, count))
        val(count)
        count += 1
Example #2
def test_template(template):
    if 'troposphere/EMR_Cluster' in template:
        pytest.skip('troposphere/EMR_Cluster uses undocumented AWS::EMR::Cluster.EbsConfiguration')
    if 'OpenStack' in template:
        pytest.skip('OpenStack is not supported')
    instance = json.load(open(template))
    val.val(instance, schema)
Example #3
def test_template(template):
    if 'troposphere/EMR_Cluster' in template:
        pytest.skip(
            'troposphere/EMR_Cluster uses undocumented AWS::EMR::Cluster.EbsConfiguration'
        )
    if 'OpenStack' in template:
        pytest.skip('OpenStack is not supported')
    instance = json.load(open(template))
    val.val(instance, schema)
Example #4
def train(model,
          optimizer,
          scheduler,
          dataset,
          cfg,
          val_dataset=None,
          vis=True):

    training_loss_list = []
    test_loss_list = []

    for epoch in range(cfg['max_epoch']):
        true_nums = 0
        for index, (inputs, label) in enumerate(dataset):
            inputs = img_preprocess(inputs)
            outputs = model.forward(inputs)
            loss, reg_loss = model.compute_loss(label)
            grads = model.backward()
            optimizer.step(grads)

            true_num, precision = cal_precision(outputs, label)
            true_nums += true_num
            logging.info(
                "[%d/%d] train loss: %.2f, reg loss: %.2f, total loss: %.4f, precision %.4f || lr: %.6f"
                % (epoch, index, loss, reg_loss,
                   (loss + reg_loss), precision, optimizer.lr))
        scheduler.step()
        params_path = save_weights(model.params, cfg['workspace'], model.name,
                                   epoch)
        logging.info("save model at: %s, training precision %.4f" %
                     (params_path, true_nums / dataset.total))
        training_loss_list.append(true_nums / dataset.total)

        if val_dataset is not None:
            loss = val(model, model.name, params_path, val_dataset)
            test_loss_list.append(loss)

    if vis:
        draw_loss_graph(cfg['workspace'] + "/loss.png", training_loss_list,
                        test_loss_list)
Example #5
def test_fn_findinmap_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::FindInMap")
Example #6
def test_fn_if_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/condition_functions/Fn::If")
Example #7
def test_lenientISO8601_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/timestamp")
Example #8
def test_fn_base64_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::Base64")
Example #9
#!/usr/bin/env python
import sys
import val
import tools

schema = tools.load('schema.json')
template = tools.load(sys.argv[1])
val.val(template, schema)
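
The tests on this page expect val.val to raise jsonschema.ValidationError on invalid input and to accept an optional definition JSON pointer. As a rough sketch only (the real val module used by these projects is not shown here and may differ), such a wrapper could delegate to jsonschema like this:

# Hypothetical sketch of a val.val-style helper, inferred from how the tests
# on this page call it; the actual implementation may differ.
import jsonschema


def val(instance, schema, definition=None):
    if definition is None:
        jsonschema.validate(instance, schema)
    else:
        # Validate against a sub-schema referenced by a JSON pointer such as
        # "#/definitions/functions/Fn::Base64", keeping the full "definitions"
        # section available for $ref resolution.
        sub_schema = {"$ref": definition,
                      "definitions": schema.get("definitions", {})}
        jsonschema.validate(instance, sub_schema)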
Example #10
def test_string_list_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/list<string>")
Example #11
def main(args):
    # Step0 ====================================================================
    # Set GPU ids
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2}_{3:d}{4}".format(args.model, args.dataset,
                                                     args.loss, args.epochs,
                                                     args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULT_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)
    # Set the same random seed everywhere for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    train_dataloader = CycleGAN_Dataloader(name=args.dataset,
                                           num_workers=args.num_workers)
    test_dataloader = CycleGAN_Dataloader(name=args.dataset,
                                          train=False,
                                          num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the model
    if args.dataset == 'cityscapes':
        A_generator = Generator(num_resblock=6)
        B_generator = Generator(num_resblock=6)
        A_discriminator = Discriminator()
        B_discriminator = Discriminator()
    else:
        A_generator = Generator(num_resblock=9)
        B_generator = Generator(num_resblock=9)
        A_discriminator = Discriminator()
        B_discriminator = Discriminator()

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        A_generator = nn.DataParallel(A_generator)
        B_generator = nn.DataParallel(B_generator)
        A_discriminator = nn.DataParallel(A_discriminator)
        B_discriminator = nn.DataParallel(B_discriminator)

    # Check CUDA available
    if torch.cuda.is_available():
        A_generator.cuda()
        B_generator.cuda()
        A_discriminator.cuda()
        B_discriminator.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set each loss function
    criterion_GAN = nn.MSELoss()
    criterion_cycle = nn.L1Loss()
    criterion_identity = nn.L1Loss()
    criterion_feature = nn.L1Loss()

    # Set each optimizer
    optimizer_G = optim.Adam(itertools.chain(A_generator.parameters(),
                                             B_generator.parameters()),
                             lr=args.lr,
                             betas=(0.5, 0.999))
    optimizer_D = optim.Adam(itertools.chain(A_discriminator.parameters(),
                                             B_discriminator.parameters()),
                             lr=args.lr,
                             betas=(0.5, 0.999))

    # Set learning rate scheduler
    def lambda_rule(epoch):
        epoch_decay = args.epochs / 2
        lr_linear_scale = 1.0 - max(0, epoch + 1 - epoch_decay) \
                                / float(epoch_decay + 1)
        return lr_linear_scale
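    # Note: this rule keeps the multiplier at 1.0 for the first half of
    # training and then decays it linearly toward ~0 over the second half
    # (e.g. with args.epochs = 200, decay begins after epoch 100).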

    scheduler_G = lr_scheduler.LambdaLR(optimizer_G, lr_lambda=lambda_rule)
    scheduler_D = lr_scheduler.LambdaLR(optimizer_D, lr_lambda=lambda_rule)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_metric = float("inf")

    # Initialize the result lists
    train_loss_G = []
    train_loss_D_A = []
    train_loss_D_B = []

    # Set image buffer
    A_buffer = ImageBuffer(args.buffer_size)
    B_buffer = ImageBuffer(args.buffer_size)

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        A_generator.load_state_dict(checkpoint['A_generator_state_dict'])
        B_generator.load_state_dict(checkpoint['B_generator_state_dict'])
        A_discriminator.load_state_dict(
            checkpoint['A_discriminator_state_dict'])
        B_discriminator.load_state_dict(
            checkpoint['B_discriminator_state_dict'])
        optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
        optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
        scheduler_G.load_state_dict(checkpoint['scheduler_G_state_dict'])
        scheduler_D.load_state_dict(checkpoint['scheduler_D_state_dict'])
        start_epoch = checkpoint['epoch']
        train_loss_G = checkpoint['train_loss_G']
        train_loss_D_A = checkpoint['train_loss_D_A']
        train_loss_D_B = checkpoint['train_loss_D_B']
        best_metric = checkpoint['best_metric']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['loss'] = args.loss
    result_data['target_epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # start after the checkpoint epoch
        if epoch < start_epoch:
            continue

        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train and validate the model
        tloss_G, tloss_D = train(
            train_dataloader, A_generator, B_generator, A_discriminator,
            B_discriminator, criterion_GAN, criterion_cycle,
            criterion_identity, optimizer_G, optimizer_D, A_buffer, B_buffer,
            args.loss, args.lambda_cycle, args.lambda_identity,
            criterion_feature, args.lambda_feature, args.attention)
        train_loss_G.append(tloss_G)
        train_loss_D_A.append(tloss_D['A'])
        train_loss_D_B.append(tloss_D['B'])

        if (epoch + 1) % 10 == 0:
            val(test_dataloader, A_generator, B_generator, A_discriminator,
                B_discriminator, epoch + 1, FILE_NAME_FORMAT, args.attention)

        # Update the optimizer's learning rate
        current_lr = optimizer_G.param_groups[0]['lr']
        scheduler_G.step()
        scheduler_D.step()
        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current_epoch'] = epoch
        result_data['train_loss_G'] = train_loss_G
        result_data['train_loss_D_A'] = train_loss_D_A
        result_data['train_loss_D_B'] = train_loss_D_B

        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data,
                        pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint
        # if train_loss_G < best_metric:
        #     best_metric = train_loss_G
        #     torch.save({
        #         'epoch': epoch+1,
        #         'A_generator_state_dict': A_generator.state_dict(),
        #         'B_generator_state_dict': B_generator.state_dict(),
        #         'A_discriminator_state_dict': A_discriminator.state_dict(),
        #         'B_discriminator_state_dict': B_discriminator.state_dict(),
        #         'optimizer_G_state_dict': optimizer_G.state_dict(),
        #         'optimizer_D_state_dict': optimizer_D.state_dict(),
        #         'scheduler_G_state_dict': scheduler_G.state_dict(),
        #         'scheduler_D_state_dict': scheduler_D.state_dict(),
        #         'train_loss_G': train_loss_G,
        #         'train_loss_D_A': train_loss_D_A,
        #         'train_loss_D_B': train_loss_D_B,
        #         'best_metric': best_metric,
        #         }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save(
            {
                'epoch': epoch + 1,
                'A_generator_state_dict': A_generator.state_dict(),
                'B_generator_state_dict': B_generator.state_dict(),
                'A_discriminator_state_dict': A_discriminator.state_dict(),
                'B_discriminator_state_dict': B_discriminator.state_dict(),
                'optimizer_G_state_dict': optimizer_G.state_dict(),
                'optimizer_D_state_dict': optimizer_D.state_dict(),
                'scheduler_G_state_dict': scheduler_G.state_dict(),
                'scheduler_D_state_dict': scheduler_D.state_dict(),
                'train_loss_G': train_loss_G,
                'train_loss_D_A': train_loss_D_A,
                'train_loss_D_B': train_loss_D_B,
                'best_metric': best_metric,
            }, CHECKPOINT_FILE_PATH)

        if (epoch + 1) % 10 == 0:
            CHECKPOINT_FILE_NAME_epoch = FILE_NAME_FORMAT + '_{0}.ckpt'
            CHECKPOINT_FILE_PATH_epoch = os.path.join(
                CHECKPOINT_PATH, FILE_NAME_FORMAT, CHECKPOINT_FILE_NAME_epoch)
            if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH_epoch)):
                os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH_epoch))
            torch.save(
                {
                    'epoch': epoch + 1,
                    'A_generator_state_dict': A_generator.state_dict(),
                    'B_generator_state_dict': B_generator.state_dict(),
                    'A_discriminator_state_dict': A_discriminator.state_dict(),
                    'B_discriminator_state_dict': B_discriminator.state_dict(),
                    'optimizer_G_state_dict': optimizer_G.state_dict(),
                    'optimizer_D_state_dict': optimizer_D.state_dict(),
                    'scheduler_G_state_dict': scheduler_G.state_dict(),
                    'scheduler_D_state_dict': scheduler_D.state_dict(),
                    'train_loss_G': train_loss_G,
                    'train_loss_D_A': train_loss_D_A,
                    'train_loss_D_B': train_loss_D_B,
                    'best_metric': best_metric,
                }, CHECKPOINT_FILE_PATH_epoch)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("loss                 : {}".format(args.loss))
        print("batch_size           : {}".format(args.batch_size))
        print("current lrate        : {:f}".format(current_lr))
        print("G loss               : {:f}".format(tloss_G))
        print("D A/B loss           : {:f}/{:f}".format(
            tloss_D['A'], tloss_D['B']))
        print("epoch time           : {0:.3f} sec".format(current -
                                                          epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))
    print('==> Train done.')

    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
Example #12
File: train.py Project: sx14/HOI-det
def main(data_root, config):

    log_dir = 'logs'
    if os.path.exists(log_dir):
        import shutil
        shutil.rmtree(log_dir)
    logger = SummaryWriter(log_dir)

    print_freq = config['print_freq']
    save_freq = config['save_freq']
    data_save_dir = config['data_save_dir']
    model_save_dir = config['model_save_dir']
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    print('===== preparing =====')
    hoi_db = prepare_hico(data_root, data_save_dir)
    test_dataset = HICODatasetSpa(hoi_db['val'])
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)
    train_dataset = HICODatasetSpa(hoi_db['train'])
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    hoi_classes_path = os.path.join(data_root, 'hoi_categories.pkl')
    hoi_classes, _, _, hoi2int = load_hoi_classes(hoi_classes_path)
    print('===== done =====')

    model = SpaLan(config['spa_feature_dim'], config['num_hoi_classes'],
                   config['num_obj_classes'], config['num_key_points'])
    model = model.cuda()

    # Optimizer
    lr = config['learning_rate']
    lr_adjust_freq = config['lr_adjust_freq']
    wd = config['weight_decay']
    mt = config['momentum']

    batch_count = 0
    last_print_time = time.time()
    for epoch in range(config['n_epochs']):
        model.train()
        optimizer = torch.optim.SGD([{
            'params': model.parameters()
        }],
                                    lr=lr,
                                    momentum=mt,
                                    weight_decay=wd,
                                    nesterov=True)

        for data in train_dataloader:
            batch_count += 1

            spa_maps = Variable(data[0]).cuda()
            obj_vecs = Variable(data[1]).cuda()
            hoi_cates = Variable(data[2]).cuda()
            bin_cates = Variable(data[3]).cuda()
            obj_cates = Variable(data[4]).cuda()
            pose_feats = Variable(data[5]).cuda()

            # Skip batches that contain no positive samples (bin label == 0).
            pos_mask = torch.eq(bin_cates, 0)
            if pos_mask.sum().item() == 0:
                continue

            optimizer.zero_grad()
            bin_prob, hoi_prob, \
            loss_bin, loss_hoi, \
            error_bin, error_hoi = model(spa_maps, obj_cates, pose_feats, hoi_cates, bin_cates, pos_mask)

            # loss = loss_bin + loss_hoi
            loss = loss_hoi
            loss.backward()
            optimizer.step()

            logger.add_scalars(
                'loss', {
                    'all': loss.data.item(),
                    'bin': loss_bin.data.item(),
                    'hoi': loss_hoi.data.item()
                }, batch_count)
            logger.add_scalars('error', {
                'bin': error_bin.data.item(),
                'hoi': error_hoi.data.item()
            }, batch_count)

            if batch_count % print_freq == 0:
                curr_time = time.time()
                print('[Epoch %d][Batch %d] loss: %.4f time: %.2fs' %
                      (epoch, batch_count, loss.data.item(),
                       curr_time - last_print_time))
                print('\t\tloss_bin: %.4f\t\tloss_cls: %.4f' %
                      (loss_bin.data.item(), loss_hoi.data.item()))
                print('\t\terror_bin: %.4f\t\terror_hoi: %.4f' %
                      (error_bin.data.item(), error_hoi.data.item()))
                last_print_time = curr_time

        model.eval()
        error_bin_avg, error_hoi_avg = val(model,
                                           test_dataloader,
                                           hoi_classes,
                                           hoi2int,
                                           show=False)
        logger.add_scalars('error_val', {
            'bin': error_bin_avg,
            'hoi': error_hoi_avg
        }, epoch)
        if (epoch + 1) % save_freq == 0:
            model_file = os.path.join(model_save_dir,
                                      '%s_%d_weights.pkl' % (model, epoch))
            torch.save(model.state_dict(), model_file)
            np.save(
                os.path.join(model_save_dir, '%s_%d_lr.pkl' % (model, epoch)),
                lr)

        if (epoch + 1) % lr_adjust_freq == 0:
            lr = lr * 0.6

    logger.close()
Example #13
def main():

    # Define task.
    parser = argparse.ArgumentParser(
        description='Large-scale deep learning framework.')
    parser.add_argument('--task',
                        metavar='NAME',
                        type=str,
                        required=True,
                        help='specify a task name that defined in $ROOT/task/')
    arg = parser.parse_args(sys.argv[1:3])
    task = importlib.import_module(arg.task)

    # Get task-specific options and print.
    task_opt = task.Option()
    opt = task_opt.opt
    print('Options.')
    for k in sorted(vars(opt)):
        if not k.startswith('dst_dir'):
            print('  {0}: {1}'.format(k, opt.__dict__[k]))

    # Build db.
    dst_dir_db = os.path.join(opt.dst_dir, opt.db)
    dst_path_db = os.path.join(dst_dir_db, 'db.pth')
    try:
        db = torch.load(dst_path_db)
        print('DB loaded.')
    except:
        db_module = importlib.import_module(opt.db)
        print('Make train DB.')
        db_train = db_module.make_dataset_train(opt.db_root)
        print('Make val DB.')
        db_val = db_module.make_dataset_val(opt.db_root)
        print('Save DB.')
        db = {'train': db_train, 'val': db_val}
        os.makedirs(dst_dir_db, exist_ok=True)
        torch.save(db, dst_path_db)
    db_train = db['train']
    db_val = db['val']

    # Estimate input statistics.
    dst_path_input_stats = os.path.join(dst_dir_db, 'db_stats.pth')
    try:
        input_stats = torch.load(dst_path_input_stats)
        print('DB input stats loaded.')
    except:
        print('Estimate DB input stats.')
        batch_manager_train = task.BatchManagerTrain(db_train, opt)
        input_stats = batch_manager_train.estimate_input_stats()
        os.makedirs(dst_dir_db, exist_ok=True)
        torch.save(input_stats, dst_path_input_stats)

    # Set destination model directory.
    dst_dir_model = os.path.join(dst_dir_db, opt.arch)
    if opt.start_from:
        assert opt.start_from.endswith('.pth.tar')
        dst_dir_model = opt.start_from[:-8]
    if task_opt.changes:
        dst_dir_model += ',' + task_opt.changes

    # Apply active learning step to source model path, destination model directory/path.
    start_from = opt.start_from
    start_from_db = None
    for stage in range(opt.stage):
        start_from = os.path.join(dst_dir_model,
                                  '{:03d}.pth.tar'.format(opt.num_epoch))
        start_from_db = os.path.join(dst_dir_model, 'db_active.pth')
        assert opt.num_epoch == len(utils.Logger(os.path.join(dst_dir_model, 'val.log'))), \
                'Finish training before the next active learning stage.'
        dst_dir_model = os.path.join(
            dst_dir_model,
            '{:03d},sampler={},stage={}'.format(opt.num_epoch, opt.sampler,
                                                stage + 1))
    dst_path_model = os.path.join(dst_dir_model, '{:03d}.pth.tar')
    print('Active learning stage {}.'.format(opt.stage))

    # Initialize model, criterion, optimizer.
    model = task.Model(opt)

    # Create loggers.
    logger_train = utils.Logger(os.path.join(dst_dir_model, 'train.log'))
    logger_val = utils.Logger(os.path.join(dst_dir_model, 'val.log'))
    assert len(logger_train) == len(logger_val)

    # If models were trained before, update information to resume training.
    best_perform = 0
    start_epoch = len(logger_train)
    if start_epoch > 0:
        best_perform = logger_val.max()
        start_from = dst_path_model.format(start_epoch)
        start_from_db = os.path.join(dst_dir_model, 'db_active.pth')
    if start_epoch == opt.num_epoch:
        print('All done.')
        return

    # Fetch previous parameters to resume training.
    dst_path_db_active = os.path.join(dst_dir_model, 'db_active.pth')
    os.makedirs(dst_dir_model, exist_ok=True)
    if start_from:
        print('Load a model from that to resume training.\n'
              '({})'.format(start_from))
        checkpoint = torch.load(start_from)
        model.model.load_state_dict(checkpoint['state_dict'])
        model.optimizer.load_state_dict(checkpoint['optimizer'])
        print('Load active DB.')
        data = torch.load(start_from_db)
        db_active = ActiveDB(data, db_val, task.BatchManagerTrain,
                             task.BatchManagerVal, input_stats, model, opt,
                             dst_dir_model)
        if start_epoch == 0:
            print('Increase active DB labels.')
            db_active.increase_labels()
            print('Save increased active DB.')
            torch.save(db_active.db, dst_path_db_active)
    else:
        print('Make initial active DB.')
        data = {'pairs': [], 'pool': db_train['pairs'], 'log': [[]]}
        db_active = ActiveDB(data, db_val, task.BatchManagerTrain,
                             task.BatchManagerVal, input_stats, model, opt,
                             dst_dir_model)
        print('Save initial active DB.')
        torch.save(db_active.db, dst_path_db_active)

    # Set training db.
    db_train = db_active.db

    # Create batch manager.
    batch_manager_train = task.BatchManagerTrain(db_train, opt)
    batch_manager_train.set_input_stats(input_stats)
    batch_manager_val = task.BatchManagerVal(db_val, opt)
    batch_manager_val.set_input_stats(input_stats)

    # Cache input data if necessary.
    if opt.cache_train_data:
        batch_manager_train.cache_data()
    if opt.cache_val_data:
        batch_manager_val.cache_data()

    # If evaluation mode, evaluate the model and exit.
    if opt.evaluate:
        return val.val(batch_manager_val, model)

    # Do the job.
    cudnn.benchmark = True
    os.makedirs(dst_dir_model, exist_ok=True)
    for epoch in range(start_epoch, opt.num_epoch):

        # Adjust learning rate before training.
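        # (step decay: divide the learning rate by 10 every opt.decay_epoch epochs)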
        learn_rate = opt.learn_rate * (0.1**(epoch // opt.decay_epoch))
        for param_group in model.optimizer.param_groups:
            param_group['lr'] = learn_rate

        # Train.
        print('\nStart training at epoch {}.'.format(epoch + 1))
        train.train(batch_manager_train, model, logger_train, epoch + 1)

        # Val.
        print('\nStart validation at epoch {}.'.format(epoch + 1))
        perform = val.val(batch_manager_val, model, logger_val, epoch + 1)

        # Save model.
        print('\nSave this model.')
        data = {
            'opt': opt,
            'log_train': logger_train.read(),
            'log_val': logger_val.read(),
            'state_dict': model.model.state_dict(),
            'optimizer': model.optimizer.state_dict()
        }
        torch.save(data, dst_path_model.format(epoch + 1))

        # Remove previous model.
        if epoch > 0:
            print('Remove the previous model.')
            os.system('rm {}'.format(dst_path_model.format(epoch)))

        # Backup the best model.
        if perform > best_perform:
            print('Backup this model as the best.')
            os.system('cp {} {}'.format(
                dst_path_model.format(epoch + 1),
                os.path.join(dst_dir_model, 'best.pth.tar')))
            best_perform = perform
Example #14
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4}_{5:.4f}_{6:.6f}{7}".format(
                                    args.model, args.dataset, args.epochs,
                                    args.batch_size, args.optimizer,
                                    args.weight_decay, args.lr, args.flag)
    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT+'_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                                BEST_CHECKPOINT_FILE_NAME)

    # Set the same random seed everywhere for reproducibility
    torch.manual_seed(190811)
    torch.cuda.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    random.seed(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'VOC2011':
        voc = VOC2011_Dataloader()
    elif args.dataset == 'VOC2012':
        voc = VOC2012_Dataloader()
    else:
        assert False, "Please select the proper dataset!"
    train_loader = voc.get_train_loader(batch_size=args.batch_size,
                                            num_workers=args.num_workers)
    val_loader = voc.get_val_loader(batch_size=args.batch_size,
                                            num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make FCN model
    if args.model == 'FCN_AlexNet':
        model = FCN_AlexNet()
    elif args.model == 'FCN_VGG16':
        model = FCN_VGG16()
    elif args.model == 'FCN_GoogLeNet':
        model = FCN_GoogLeNet()
    elif args.model == 'FCN_32s':
        model = FCN_32s()
    elif args.model == 'FCN_32s_fixed':
        model = FCN_32s_fixed()
    elif args.model == 'FCN_16s':
        model = FCN_16s()
        model.load_state_dict(torch.load('./model/pretrained/FCN32s_'
                                    +args.dataset+'_'+args.optimizer
                                    )['model_state_dict'], strict=False)
    elif args.model == 'FCN_8s':
        model = FCN_8s()
        model.load_state_dict(torch.load('./model/pretrained/FCN16s_'
                                    +args.dataset+'_'+args.optimizer
                                    )['model_state_dict'], strict=False)
    elif args.model == 'FCN_4s':
        model = FCN_4s()
        model.load_state_dict(torch.load('./model/pretrained/FCN8s_'
                                    +args.dataset+'_'+args.optimizer
                                    )['model_state_dict'], strict=False)
    elif args.model == 'FCN_2s':
        model = FCN_2s()
        model.load_state_dict(torch.load('./model/pretrained/FCN4s_'
                                    +args.dataset+'_'+args.optimizer
                                    )['model_state_dict'], strict=False)
    elif args.model == 'FCN_1s':
        model = FCN_1s()
        model.load_state_dict(torch.load('./model/pretrained/FCN2s_'
                                    +args.dataset+'_'+args.optimizer
                                    )['model_state_dict'], strict=False)
    elif args.model == 'DeconvNet':
        model = DeconvNet()
    else:
        assert False, "Please select the FCN model"

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Check CUDA available
    if torch.cuda.is_available():
        model.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer
    criterion = nn.CrossEntropyLoss()

    # Separate parameters for bias double learning rate
    normal_parameters = []
    double_parameters = []
    for name, parameter in model.named_parameters():
        if 'bias' in name:
            double_parameters.append(parameter)
        else:
            normal_parameters.append(parameter)

    # Select the optimizer
    if args.optimizer == 'SGD':
        optimizer = optim.SGD([
            {'params': normal_parameters},
            {'params': double_parameters, 'lr':args.lr*2,
                                          'weight_decay': args.weight_decay*0},
            ], lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam([
            {'params': normal_parameters},
            {'params': double_parameters, 'lr':args.lr*2,
                                          'weight_decay': args.weight_decay*0},
            ], lr=args.lr, betas=(0.9, 0.999), weight_decay=args.weight_decay)
    else:
        assert False, "Please select the proper optimizer."

    # Set the learning rate scheduler
    # scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=10, verbose=True, threshold=1e-4)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_val_mean_IoU = 0

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']

    # Save the training information
    result_data = {}
    result_data['model']            = args.model
    result_data['dataset']          = args.dataset
    result_data['target epoch']     = args.epochs
    result_data['batch_size']       = args.batch_size
    result_data['optimizer']        = args.optimizer
    result_data['weight_decay']     = args.weight_decay
    result_data['lr']               = args.lr

    # Initialize the result lists
    train_loss = []
    train_pixel_acc = []
    train_mean_acc = []
    train_mean_IoU = []
    train_frew_IoU = []
    val_loss = []
    val_pixel_acc = []
    val_mean_acc = []
    val_mean_IoU = []
    val_frew_IoU = []

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch+1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train the model
        tloss, tmetric = train(model, train_loader, criterion, optimizer)
        train_loss.append(tloss)
        train_pixel_acc.append(tmetric[0])
        train_mean_acc.append(tmetric[1])
        train_mean_IoU.append(tmetric[2])
        train_frew_IoU.append(tmetric[3])

        # validate the model
        vloss, vmetric = val(model, val_loader, criterion)
        val_loss.append(vloss)
        val_pixel_acc.append(vmetric[0])
        val_mean_acc.append(vmetric[1])
        val_mean_IoU.append(vmetric[2])
        val_frew_IoU.append(vmetric[3])

        # update learning rate
        # scheduler.step(vloss)
        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current epoch']    = epoch
        result_data['train_loss']       = train_loss
        result_data['train_pixel_acc']  = train_pixel_acc
        result_data['train_mean_acc']   = train_mean_acc
        result_data['train_mean_IoU']   = train_mean_IoU
        result_data['train_frew_IoU']   = train_frew_IoU
        result_data['val_loss']         = val_loss
        result_data['val_pixel_acc']    = val_pixel_acc
        result_data['val_mean_acc']     = val_mean_acc
        result_data['val_mean_IoU']     = val_mean_IoU
        result_data['val_frew_IoU']     = val_frew_IoU

        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)



        # Save the best checkpoint
        if vmetric[2] > best_val_mean_IoU:
            best_val_mean_IoU = vmetric[2]
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_mean_IoU': best_val_mean_IoU,
                }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_mean_IoU': vmetric[2]
            }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("optimizer            : {}".format(args.optimizer))
        print("learning rate        : {:f}".format(optimizer.param_groups[0]['lr']))
        print("weight decay         : {:f}".format(optimizer.param_groups[0]['weight_decay']))
        print("train/val loss       : {:f}/{:f}".format(tloss,vloss))
        print("train/val pixel acc  : {:f}/{:f}".format(tmetric[0],vmetric[0]))
        print("train/val mean acc   : {:f}/{:f}".format(tmetric[1],vmetric[1]))
        print("train/val mean IoU   : {:f}/{:f}".format(tmetric[2],vmetric[2]))
        print("train/val frew IoU   : {:f}/{:f}".format(tmetric[3],vmetric[3]))
        print("epoch time     : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time: {0:.3f} sec".format(current - start))
    print('==> Train done.')

    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
Example #15
    model = model.to(device)

    # == Load Data ==
    root = 'D:/data/dogs_cats'
    dataset_train = DogCatData(root, mode='train')
    dataset_val = DogCatData(root, mode='val')
    train_data = DataLoader(dataset_train,
                            shuffle=True,
                            batch_size=32,
                            num_workers=4)
    val_data = DataLoader(dataset_val,
                          shuffle=True,
                          batch_size=32,
                          num_workers=4)

    # == optimizer ==
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # == Main Loop ==
    max_acc = 0
    max_epoch = 1
    for epoch in range(max_epoch):
        train(model, train_data, epoch, criterion, optimizer)
        acc = val(model, val_data)
        if acc > max_acc:
            max_acc = acc
            torch.save(model, 'checkpoints/lenet_max.pt')

    print('==========Max Acc: {}=========='.format(max_acc))
Example #16
def train(generator, discriminator, train_loader, criterion, optimizer_G,
          optimizer_D, clipping, num_critic, step_counter, validate_noise,
          FILE_NAME_FORMAT):
    generator.train()
    discriminator.train()
    device = next(generator.parameters()).device.index
    losses_G = []
    losses_D = []
    distances = []
    total_iter = len(train_loader)

    for i, (images, _) in enumerate(train_loader):
        real_images = images.cuda(device)

        #=======================================================================
        # For WGAN
        if criterion is None:
            #-------------------------------------------------------------------
            ''' Train Discriminator Network '''
            #-------------------------------------------------------------------

            # Make real & fake labels and noise for the generator
            batch_size = real_images.size(0)
            real_label = torch.full((batch_size, ), 1, device=device)
            fake_label = torch.full((batch_size, ), 0, device=device)
            noise = torch.randn(batch_size, 100, 1, 1, device=device)

            # Empty discriminator's gradients
            discriminator.zero_grad()

            # Generate fake images from noise
            fake_images = generator(noise)

            # Discriminate the images
            output_real = discriminator(real_images)
            output_fake = discriminator(fake_images.detach())

            # Calculate loss
            loss_D = -output_real.mean() + output_fake.mean()
            losses_D.append(loss_D.item())

            # Calculate gradients (Backpropagation)
            loss_D.backward()

            # Calculate Earth Mover(EM) distance
            distance = output_real.mean() - output_fake.mean()
            distances.append(distance)

            # Update discriminator's parameters
            optimizer_D.step()

            # Clip the discriminator's parameters (WGAN Lipschitz constraint)
            for parameter in discriminator.parameters():
                parameter.data.clamp_(-clipping, clipping)

            #-------------------------------------------------------------------
            ''' Train Generator Network '''
            #-------------------------------------------------------------------
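            # Default generator loss so the status print below always has a
            # value on critic-only iterations; it is recomputed every
            # num_critic steps.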
            loss_G = -output_fake.mean()

            # For every num_critic iteration
            if step_counter.current_step % num_critic == 0:

                # Empty generator's gradients
                generator.zero_grad()

                # Generate fake images from noise
                fake_images = generator(noise)

                # Discriminate the images
                output_fake = discriminator(fake_images)

                # Calculate loss
                loss_G = -output_fake.mean()
                losses_G.extend([loss_G.item()] * num_critic)

                # Calculate gradients (Backpropagation)
                loss_G.backward()

                # Update generator's parameters
                optimizer_G.step()
        #=======================================================================
        # For DCGAN
        else:
            #-------------------------------------------------------------------
            ''' Train Discriminator Network '''
            # maximize log(D(x)) + log(1 - D(G(z)))
            #-------------------------------------------------------------------

            # Make real & fake labels and noise for the generator
            batch_size = real_images.size(0)
            real_label = torch.full((batch_size, ), 1, device=device)
            fake_label = torch.full((batch_size, ), 0, device=device)
            noise = torch.randn(batch_size, 100, 1, 1, device=device)

            # <For real images> ------------------------------------------------
            # Empty discriminator's gradients
            discriminator.zero_grad()

            # Predict targets (Forward propagation)
            output = discriminator(real_images)
            pred_label = torch.sigmoid(output).view(-1)

            # Calculate loss (for real images)
            loss_D_real = criterion(pred_label, real_label)

            # Calculate gradients (Backpropagation)
            loss_D_real.backward()

            # <For fake images> ------------------------------------------------
            # Generate fake images from noise
            fake_images = generator(noise)

            # Discriminate the images
            output = discriminator(fake_images.detach())
            pred_label = torch.sigmoid(output).view(-1)

            # Calculate loss (for fake images)
            loss_D_fake = criterion(pred_label, fake_label)

            # Calculate gradients (Backpropagation)
            loss_D_fake.backward()

            # Make mixed error (Add the both of gradients) ---------------------
            loss_D = loss_D_real + loss_D_fake
            losses_D.append(loss_D.item())

            # Update discriminator's parameters
            optimizer_D.step()

            #-------------------------------------------------------------------
            ''' Train Generator Network '''
            # maximize log(D(G(z)))
            #-------------------------------------------------------------------

            # Empty generator's gradients
            generator.zero_grad()

            # Discriminate the fake images again (without detaching)
            output = discriminator(fake_images)
            pred_label = torch.sigmoid(output).view(-1)

            # Calculate loss (for fake images)
            loss_G = criterion(pred_label, real_label)
            losses_G.append(loss_G.item())

            # Calculate gradients (Backpropagation)
            loss_G.backward()

            # Update generator's parameters
            optimizer_G.step()

            #-------------------------------------------------------------------
            # Calculate Jensen-Shannon divergence
            target_label = torch.full((batch_size, ), 0.5)
            distance = jensenshannon(pred_label.detach().cpu(),
                                     target_label)**2
            # Ref: Jensen-Shannon distance = sqrt(Jensen-Shannon divergence)
            distances.append(distance)

        #=======================================================================
        # Count the step
        step_counter.step()

        # Display current status
        print("[{:5d}/{:5d}]".format(i + 1, total_iter), end='')
        print(" loss_G: {:f} loss_D: {:f} dist: {:f} step: {:d}   \r".format(
            loss_G, loss_D, distance, step_counter.current_step),
              end='')

        # validate the model
        if (step_counter.current_step % 500 == 0
                or step_counter.current_step == step_counter.objective_step):
            val(generator, validate_noise, step_counter, FILE_NAME_FORMAT)

        # Check the current step
        if step_counter.current_step >= step_counter.objective_step:
            step_counter.exit_signal = True
            break

    #===========================================================================
    return losses_G, losses_D, distances
Example #17
def test_valid():
    val.val(valid, resource_schema)
Example #18
File: main.py Project: flamz3d/vae-pytorch
    print("{} model chosen.\n".format(opt.model))

    vae = Model(vae_model,z_dim=opt.z_dim)

    best_loss = float("inf")
    best_epoch = -1

    for epoch in range(opt.epochs):

        for m in metrics:
            m.reset()

        print("====== Epoch {} ======".format(epoch))
        train(epoch, vae, t_generator, compute_vae, metrics, (models_folder, maps_folder), opt, train_logger)
        vae_loss,log_p_x = val(epoch, vae, v_generator, compute_vae, metrics, (models_folder, maps_folder), opt, val_logger)
        
        is_best = False
        if vae_loss < best_loss:
            best_loss = vae_loss
            best_epoch = epoch
            is_best = True

        internal_state = {
            'model':opt.model,
            'dataset': opt.dataset,
            'z_dim': opt.z_dim,
            'current_epoch': epoch,
            'best_epoch': best_epoch,
            'best_loss': best_loss,
            'model_vae_state_dict': vae.vae.state_dict(),
Example #19
def test_invalid():
    with pytest.raises(jsonschema.ValidationError):
        val.val(invalid, resource_schema)
Example #20
    def _agent_sampler(self, mode):
        # Define functions.
        if mode == 'reg':

            def _gen_agent_target(posteriors, targets):
                return [
                    0.5 - posterior[0, targets[i][0]]
                    for i, posterior in enumerate(posteriors)
                ]

            def is_best(eval_best, eval_current):
                return eval_best > eval_current

            num_out_dim = 1
            type_fun = float
            criterion = nn.MSELoss()
            evaluator = metric.mse
            learn_rate = 0.001

        elif mode == 'cls':

            def _gen_agent_target(posteriors, targets):
                return [
                    posterior[0, targets[i][0]] != posterior.max()
                    for i, posterior in enumerate(posteriors)
                ]

            def ap(outputs, targets):
                return metric.ap(F.softmax(outputs)[:, [1]].data, targets)

            def is_best(eval_best, eval_current):
                return eval_best < eval_current

            num_out_dim = 2
            type_fun = int
            criterion = nn.CrossEntropyLoss()
            evaluator = ap
            learn_rate = 0.01

        def _make_agent_dataset(db, db_indices, agent_targets, targets):
            pairs = []
            for i, agent_target in enumerate(agent_targets):
                image, target = db['pairs'][db_indices[i]]
                assert target == targets[i][0]
                pairs.append((image, type_fun(agent_target)))
            return {'pairs': pairs}

        # Predict class posteriors to compute agent targets.
        posteriors_train, targets_train, db_indices_train = self._compute_posteriors(
            self._db['pairs'], self._model)
        posteriors_val, targets_val, db_indices_val = self._compute_posteriors(
            self._db_val['pairs'], self._model)

        # Generate agent targets.
        agent_targets_train = _gen_agent_target(posteriors_train,
                                                targets_train)
        agent_targets_val = _gen_agent_target(posteriors_val, targets_val)

        # Create input-target pairs to learn an agent.
        agent_db_train = _make_agent_dataset(self._db, db_indices_train,
                                             agent_targets_train,
                                             targets_train)
        agent_db_val = _make_agent_dataset(self._db_val, db_indices_val,
                                           agent_targets_val, targets_val)

        # Create batch managers to learn an agent.
        batch_manager_train = self._BatchManagerTrain(agent_db_train,
                                                      self._opt,
                                                      self._input_stats)
        batch_manager_train._evaluator = evaluator
        batch_manager_val = self._BatchManagerVal(agent_db_val, self._opt,
                                                  self._input_stats)
        batch_manager_val._evaluator = evaluator

        # Cache input data if necessary.
        if self._opt.cache_train_data:
            batch_manager_train.cache_data()
        if self._opt.cache_val_data:
            batch_manager_val.cache_data()

        # Create loggers.
        logger_train = utils.Logger(
            os.path.join(self._dst_dir_agent, 'agent-train.log'))
        logger_val = utils.Logger(
            os.path.join(self._dst_dir_agent, 'agent-val.log'))
        assert len(logger_train) == 0 and len(logger_val) == 0

        # Initialize an agent from the model.
        model = deepcopy(self._model.model.module)
        model.fc = nn.Linear(model.fc.in_features, num_out_dim)
        model = torch.nn.DataParallel(model)
        model = model.cuda()
        optimizer = torch.optim.SGD(model.parameters(),
                                    self._opt.learn_rate,
                                    momentum=self._opt.momentum,
                                    weight_decay=self._opt.weight_decay)

        class Model(object):
            def __init__(self):
                self.model = model
                self.criterion = criterion.cuda()
                self.optimizer = optimizer

        agent = Model()

        # Learn the agent.
        best_perform = 0
        for param_group in agent.optimizer.param_groups:
            param_group['lr'] = learn_rate
        for epoch in range(3):
            print('\nStart agent training at epoch {}.'.format(epoch + 1))
            train.train(batch_manager_train, agent, logger_train, epoch + 1)
            print('\nStart agent validation at epoch {}.'.format(epoch + 1))
            perform = val.val(batch_manager_val, agent, logger_val, epoch + 1)
            if is_best(best_perform, perform):
                best_perform = perform
                best_agent = deepcopy(agent)

        # Predict uncertainty with the agent over unlabeled set.
        posteriors, _, db_indices = self._compute_posteriors(
            self._db['pool'], best_agent)

        # Compute uncertainties and rank unlabeled samples, most uncertain first.
        uncertainties = []
        for i, posterior in enumerate(posteriors):
            uncertainties.append(posterior[0, 1])
        _, indices = torch.sort(torch.Tensor(uncertainties), descending=True)

        return torch.LongTensor(db_indices)[indices[:self._opt.sampling_size]]
Example #21
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
):
    logger = logging.getLogger("rcnn.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    max_iou = 0
    for iteration, (images, targets, _, __) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 100 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
        if iteration % checkpoint_period == 0:
            # checkpointer.save("model_{:07d}".format(iteration), **arguments)
            MODEL_PATH = os.path.join(sys.path[0], 'data', 'output', 'model')
            if not os.path.exists(MODEL_PATH):
                os.makedirs(MODEL_PATH)
            torch.save(model, os.path.join(MODEL_PATH, 'last.pkl'))
            print('Save !')
            val()

        if iteration == max_iter:
            # checkpointer.save("model_final", **arguments)
            pass
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #22
File: main.py Project: Sxela/YOLO-Nano
            model_dict = model.state_dict()
            passed_dict = ['conv9.weight','conv10.weight','conv11.weight']
            new_state_dict = OrderedDict()
            new_state_dict = {k: v for k,v in checkpoint['state_dict'].items() if k not in passed_dict}
            model_dict.update(new_state_dict)
            model.load_state_dict(model_dict)
        else:
            model.load_state_dict(checkpoint['state_dict'])

        opt.begin_epoch = checkpoint['epoch']
        model = model.to(opt.device)
        if not opt.no_train and not opt.pretrained:
            optimizer.load_state_dict(checkpoint['optimizer'])
        best_mAP = checkpoint["best_mAP"]


    ########################################
    #           Train, Val, Test           #
    ########################################
    if opt.test:
        test(model,test_dataloader,opt.begin_epoch,opt)
    else:
        for epoch in range(opt.begin_epoch, opt.num_epochs + 1):
            if not opt.no_train:
                print("\n---- Training Model ----")
                train(model, optimizer, train_dataloader, epoch, opt, train_logger, best_mAP=best_mAP)

            if not opt.no_val and (epoch+1) % opt.val_interval == 0:
                print("\n---- Evaluating Model ----")
                best_mAP = val(model, optimizer, val_dataloader, epoch, opt, val_logger, best_mAP=best_mAP)
def test_invalid():
    with pytest.raises(jsonschema.ValidationError):
        val.val(invalid, resource_schema)
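The `val.val` helper itself is not shown in these snippets. A minimal sketch of a jsonschema-based implementation consistent with how it is called here (hypothetical; the real module may differ):

import jsonschema


def val(instance, schema, definition=None):
    # Validate against the whole schema, or against one of its
    # '#/definitions/...' sub-schemas via a JSON pointer reference.
    if definition is None:
        jsonschema.validate(instance, schema)
    else:
        resolver = jsonschema.RefResolver.from_schema(schema)
        jsonschema.validate(instance, {'$ref': definition}, resolver=resolver)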
Example #24
def test_valid():
    val.val(valid, resource_schema)
Example #25
def test_fn_if_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance,
                basic_types_schema,
                definition="#/definitions/condition_functions/Fn::If")
Example #26
            sys.exit()

        if (iteration % DC.save_iter == 0):
            t.save(model.state_dict(),
                   'ResNet152-iter' + str(iteration) + '.pth')

        # Touching a sentinel file named 'SAVENOW' in the working directory
        # forces an immediate checkpoint; the file is removed once saved.
        if os.path.isfile('SAVENOW'):
            t.save(model.state_dict(),
                   'ResNet152-iter' + str(iteration) + '.pth')
            os.remove('SAVENOW')

        if DC.val_in_train and iteration % DC.val_iter == 0:
            # print('model.training: ', model.training)
            time1 = time.time()

            val_out = val.val(True, model, val_transform, val_data,
                              val_dataloader)

            model.train()

            val_elps_time = time.time() - time1

            print(
                'Validate now, ', ' epoch:', epoch + 1, ' iter:', iteration,
                ' avg loss: {:.8f}'.format(val_out[0]),
                ' accuracy:({}/{}) {:.4f}% '.format(val_out[1], val_out[2],
                                                    val_out[3]),
                ' time: {:.3f}'.format(val_elps_time))

    if (epoch + 1 == DC.max_epoch):
        t.save(model.state_dict(), 'ResNet152-iter' + str(iteration) + '.pth')
Example #27
def test_fn_getazs_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/functions/Fn::GetAZs")
def test_fn_join_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::Join")
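For reference, CloudFormation documents `Fn::Join` as a delimiter followed by a list of values, so an instance for a test like the one above could look like:

# Documented Fn::Join shape: [delimiter, [values...]].
valid_join = {'Fn::Join': [':', ['a', 'b', 'c']]}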
Example #29
def test_fn_join_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/functions/Fn::Join")
def test_string_function_valid(instance, definition):
    val.val(instance, basic_types_schema,
            definition=definition)
Example #31
def test_string_list_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/list<string>")
def test_lenientISO8601_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/timestamp")
Example #33
def test_fn_base64_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/functions/Fn::Base64")
def test_fn_base64_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance, basic_types_schema,
                definition="#/definitions/functions/Fn::Base64")
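`Fn::Base64` takes a single string to encode, so example instances for the valid and invalid cases above might be:

# Fn::Base64 encodes one string; a non-string argument is expected to fail validation.
valid_b64 = {'Fn::Base64': 'AWS CloudFormation'}
invalid_b64 = {'Fn::Base64': ['not', 'a', 'string']}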
Example #35
def test_string_function_valid(instance, definition):
    val.val(instance, basic_types_schema, definition=definition)
def test_fn_if_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/condition_functions/Fn::If")
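`Fn::If` is documented as `[condition_name, value_if_true, value_if_false]`, so a valid instance might be:

# Documented Fn::If shape: [condition_name, value_if_true, value_if_false].
valid_if = {'Fn::If': ['CreateProdResources', 'c1.xlarge', 't2.micro']}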
Example #37
def test_lenientISO8601_valid(instance):
    val.val(instance, basic_types_schema, definition="#/definitions/timestamp")
def test_fn_getazs_valid(instance):
    val.val(instance, basic_types_schema,
            definition="#/definitions/functions/Fn::GetAZs")
Example #39
def test_lenientISO8601_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance,
                basic_types_schema,
                definition="#/definitions/timestamp")
Example #40
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4:d}_{5:f}{6}".format(
        args.model, args.dataset, args.epochs, args.obj_step, args.batch_size,
        args.lr, args.flag)

    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT + '_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT + '_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                             BEST_CHECKPOINT_FILE_NAME)

    # Set the random seed same for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'CelebA':
        dataloader = CelebA_Dataloader()
    else:
        assert False, "Please select the proper dataset."

    train_loader = dataloader.get_train_loader(batch_size=args.batch_size,
                                               num_workers=args.num_workers)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make the model
    if args.model in ['WGAN', 'DCGAN']:
        generator = Generator(BN=True)
        discriminator = Discriminator(BN=True)
    elif args.model in ['WGAN_noBN', 'DCGAN_noBN']:
        generator = Generator(BN=False)
        discriminator = Discriminator(BN=False)
    else:
        assert False, "Please select the proper model."

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        generator = nn.DataParallel(generator)
        discriminator = nn.DataParallel(discriminator)

    # Check CUDA available
    if torch.cuda.is_available():
        generator.cuda()
        discriminator.cuda()
    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer
    if args.model in ['DCGAN', 'DCGAN_noBN']:
        criterion = nn.BCELoss()
    else:
        criterion = None
    optimizer_G = torch.optim.RMSprop(generator.parameters(), lr=args.lr)
    optimizer_D = torch.optim.RMSprop(discriminator.parameters(), lr=args.lr)
    step_counter = StepCounter(args.obj_step)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_metric = float("inf")
    validate_noise = torch.randn(args.batch_size, 100, 1, 1)

    # Initialize the result lists
    train_loss_G = []
    train_loss_D = []
    train_distance = []

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        generator.load_state_dict(checkpoint['generator_state_dict'])
        discriminator.load_state_dict(checkpoint['discriminator_state_dict'])
        optimizer_G.load_state_dict(checkpoint['optimizer_G_state_dict'])
        optimizer_D.load_state_dict(checkpoint['optimizer_D_state_dict'])
        start_epoch = checkpoint['epoch']
        step_counter.current_step = checkpoint['current_step']
        train_loss_G = checkpoint['train_loss_G']
        train_loss_D = checkpoint['train_loss_D']
        train_distance = checkpoint['train_distance']
        best_metric = checkpoint['best_metric']

    # Save the training information
    result_data = {}
    result_data['model'] = args.model
    result_data['dataset'] = args.dataset
    result_data['target_epoch'] = args.epochs
    result_data['batch_size'] = args.batch_size

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))

    print('==> Train ready.')

    # Validate before training (step 0)
    val(generator, validate_noise, step_counter, FILE_NAME_FORMAT)

    for epoch in range(args.epochs):
        # start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch + 1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train the model (+ validate the model)
        tloss_G, tloss_D, tdist = train(generator, discriminator, train_loader,
                                        criterion, optimizer_G, optimizer_D,
                                        args.clipping, args.num_critic,
                                        step_counter, validate_noise,
                                        FILE_NAME_FORMAT)
        train_loss_G.extend(tloss_G)
        train_loss_D.extend(tloss_D)
        train_distance.extend(tdist)
        #=======================================================================
        current = time.time()

        # Calculate average loss
        avg_loss_G = sum(tloss_G) / len(tloss_G)
        avg_loss_D = sum(tloss_D) / len(tloss_D)
        avg_distance = sum(tdist) / len(tdist)

        # Save the current result
        result_data['current_epoch'] = epoch
        result_data['train_loss_G'] = train_loss_G
        result_data['train_loss_D'] = train_loss_D
        result_data['train_distance'] = train_distance

        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data,
                        pkl_file,
                        protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint
        # if avg_distance < best_metric:
        #     best_metric = avg_distance
        #     torch.save({
        #         'epoch': epoch+1,
        #         'generator_state_dict': generator.state_dict(),
        #         'discriminator_state_dict': discriminator.state_dict(),
        #         'optimizer_G_state_dict': optimizer_G.state_dict(),
        #         'optimizer_D_state_dict': optimizer_D.state_dict(),
        #         'current_step': step_counter.current_step,
        #         'best_metric': best_metric,
        #         }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save(
            {
                'epoch': epoch + 1,
                'generator_state_dict': generator.state_dict(),
                'discriminator_state_dict': discriminator.state_dict(),
                'optimizer_G_state_dict': optimizer_G.state_dict(),
                'optimizer_D_state_dict': optimizer_D.state_dict(),
                'current_step': step_counter.current_step,
                'train_loss_G': train_loss_G,
                'train_loss_D': train_loss_D,
                'train_distance': train_distance,
                'best_metric': best_metric,
            }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("current step         : {:d}".format(step_counter.current_step))
        print("current lrate        : {:f}".format(args.lr))
        print("gen/disc loss        : {:f}/{:f}".format(
            avg_loss_G, avg_loss_D))
        print("distance metric      : {:f}".format(avg_distance))
        print("epoch time           : {0:.3f} sec".format(current -
                                                          epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))

        # If iteration step has been satisfied
        if step_counter.exit_signal:
            break

    print('==> Train done.')

    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
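The `args.clipping` and `args.num_critic` arguments passed to `train()` point to the standard WGAN recipe. A hedged sketch of the critic weight-clipping step that `train()` is assumed to perform after each discriminator update:

def clip_critic_weights(discriminator, clipping):
    # WGAN weight clipping: keep every critic parameter in [-clipping, clipping].
    for p in discriminator.parameters():
        p.data.clamp_(-clipping, clipping)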
Example #41
def test_fn_base64_invalid(instance):
    with pytest.raises(jsonschema.ValidationError):
        val.val(instance,
                basic_types_schema,
                definition="#/definitions/functions/Fn::Base64")
Example #42
def main():
    parser = argparse.ArgumentParser(description='SegTransformer training')
    parser.add_argument('--config', type=str, required=True)
    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    config = load_config_yaml(args.config)
    data_config = load_config_yaml(config['data_config'])

    now = datetime.datetime.now()
    date_time = now.strftime("%Y-%m-%d-%H-%M")
    os.makedirs(config['logging_dir'], exist_ok=True)
    logging_path = os.path.join(config['logging_dir'],
                                f'logging_train_{date_time}.txt')
    logger = create_logger(logging_path, stdout=False)

    ###################################################################################
    # construct net
    ###################################################################################
    n_channel = data_config['dataset']['2d']['n_slice']
    n_class = len(data_config['dataset']['3d']['roi_names'])
    if data_config['dataset']['3d']['with_issue_air_mask']:
        n_class += 2
    start_channel = int(config['start_channel'])
    logger.info(
        f'create model with n_channel={n_channel}, start_channel={start_channel}, n_class={n_class}'
    )

    model = SegTransformer(
        n_channel=n_channel,
        start_channel=start_channel,
        n_class=n_class,
        deep_supervision=config["deep_supervision"]).to(device)

    logger.info(f"model_dir: {config['ckpt_dir']}")

    ###################################################################################
    # criterion, optimizer, scheduler
    ###################################################################################
    criterion = Criterion(config)
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=config['lr'],
                                  weight_decay=config['weight_decay'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=config['step_size'])
    if config['deep_supervision']:
        logger.info('Train model using deep supervision')
    else:
        logger.info('Train model without deep supervision')

    ###################################################################################
    # SummaryWriter
    ###################################################################################
    logger.info("Creating writer")
    writer = SummaryWriter(comment=f"LR_{config['lr']}_EPOCHS_{config['n_epoch']}")

    ###################################################################################
    # train setup
    ###################################################################################
    global_step = 0
    best_loss = np.inf
    epoch_start = 0

    ###################################################################################
    # load previous model
    ###################################################################################
    if config['load_checkpoint']:
        logger.info(
            f'Loading model from {os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...'
        )
        model, optimizer, scheduler, epoch_start, global_step = load_checkpoint(
            model, optimizer, scheduler, config['ckpt_dir'], config['ckpt_fn'],
            device)
    elif config['load_checkpoint_encoder']:
        logger.info(
            f'Loading encoder from {os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...'
        )
        model.encoder = load_checkpoint_encoder(model.encoder,
                                                ckpt_dir=config['ckpt_dir'],
                                                ckpt_fn=config['ckpt_fn'],
                                                device=device)
        if config['freeze_encoder']:
            logger.info('Freeze encoder')
            freeze(model.encoder)
    elif config['load_checkpoint_decoder']:
        logger.info(
            f'Loading decoder from {os.path.join(config["ckpt_dir"], config["ckpt_fn"])}...'
        )
        model.decoder = load_checkpoint_decoder(model.decoder,
                                                ckpt_dir=config['ckpt_dir'],
                                                ckpt_fn=config['ckpt_fn'],
                                                device=device)
        if config['freeze_decoder']:
            logger.info('Freeze decoder')
            freeze(model.decoder)

    ###################################################################################
    # parallel model and data
    ###################################################################################
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = torch.nn.DataParallel(model)

    ###################################################################################
    # Dataset
    ###################################################################################
    dataloader_3d = create_loader_3d(data_config, 'train')
    ###################################################################################
    # train
    ###################################################################################
    logger.info(f'Starting training from epoch: {epoch_start}')
    for epoch in range(epoch_start, config['n_epoch']):
        logger.info(f"Epoch: {epoch}/{config['n_epoch']}")
        epoch_loss = 0
        epoch_loss_focal = 0
        epoch_loss_dice = 0
        n_batch_3d = len(dataloader_3d)
        with tqdm(total=n_batch_3d,
                  desc=f"Epoch {epoch + 1}/{config['n_epoch']}",
                  unit='batch') as pbar:
            for batch_3d in dataloader_3d:
                dataloader_2d = create_loader_2d(batch_3d, data_config,
                                                 'train')
                n_batch_2d = len(dataloader_2d)
                for idx, batch_2d in enumerate(dataloader_2d):
                    img = batch_2d['img'].to(
                        device=device,
                        dtype=torch.float32)  # [N, n_channel, H, W]
                    mask_gt = batch_2d['mask'].to(
                        device=device, dtype=torch.float32)  # [N, H, W]
                    mask_pred = model(img)
                    mask_flag = batch_2d['mask_flag'].to(device=device,
                                                         dtype=torch.float32)

                    loss, loss_dict = criterion(pred=mask_pred,
                                                target=mask_gt,
                                                target_roi_weight=mask_flag)
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_value_(model.parameters(), 0.01)
                    optimizer.step()

                    global_step += 1
                    loss_scalar = loss_dict["loss"]
                    loss_focal_scalar = loss_dict["focal_loss"]
                    loss_dice_scalar = loss_dict["dice_loss"]
                    epoch_loss += loss_scalar
                    epoch_loss_focal += loss_focal_scalar
                    epoch_loss_dice += loss_dice_scalar

                    pbar.set_postfix(
                        **{
                            'loss (batch)': loss_scalar,
                            'loss_focal': loss_focal_scalar,
                            'loss_dice': loss_dice_scalar,
                            'global_step': global_step
                        })

                    if (global_step + 1) % (
                            config['write_summary_loss_batch_step']) == 0:
                        logger.info(
                            f"\tBatch: {idx}/{n_batch_2d}, Loss: {loss_scalar}, Focal_loss: {loss_focal_scalar}, Dice_loss: {loss_dice_scalar}"
                        )
                        writer.add_scalar('Loss/train', loss_scalar,
                                          global_step)
                        writer.add_scalar('Loss/train_focal',
                                          loss_focal_scalar, global_step)
                        writer.add_scalar('Loss/train_dice', loss_dice_scalar,
                                          global_step)
                    if (global_step +
                            1) % (config['write_summary_2d_batch_step']) == 0:
                        writer.add_images(
                            'train/images',
                            torch.unsqueeze(img[:, n_channel // 2], 1),
                            global_step)
                        writer.add_images(
                            'train/gt_masks',
                            torch.sum(mask_gt, dim=1, keepdim=True),
                            global_step)
                        writer.add_images(
                            'train/pred_masks',
                            torch.sum(mask_pred[0] > 0, dim=1, keepdim=True) >=
                            1, global_step)
                        writer.add_images(
                            'train/pred_masks_raw',
                            torch.sum(mask_pred[0], dim=1, keepdim=True),
                            global_step)
                pbar.update()

            scheduler.step()
            # log epoch loss
            if (epoch + 1) % config['logging_epoch_step'] == 0:
                writer.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
                writer.add_scalar('Loss_epoch/train', epoch_loss, epoch)
                writer.add_scalar('Loss_epoch/train_focal', epoch_loss_focal,
                                  epoch)
                writer.add_scalar('Loss_epoch/train_dice', epoch_loss_dice,
                                  epoch)
                logger.info(
                    f"Epoch: {epoch}/{config['n_epoch']}, Train Loss: {epoch_loss}, Train Loss Focal: {epoch_loss_focal}, Train Loss Dice: {epoch_loss_dice}"
                )

            # validation and save model
            if (epoch + 1) % config['val_model_epoch_step'] == 0:
                val_loss, val_focal_loss, val_dice_loss = val(
                    model, criterion, data_config, n_channel, logger, writer,
                    global_step, device)
                writer.add_scalar('Loss_epoch/val', val_loss, epoch)
                writer.add_scalar('Loss_epoch/val_focal', val_focal_loss,
                                  epoch)
                writer.add_scalar('Loss_epoch/val_dice', val_dice_loss, epoch)
                logger.info(
                    f"Epoch: {epoch}/{config['n_epoch']}, Validation Loss: {val_loss}, Validation Loss Focal: {val_focal_loss}, Validation Loss Dice: {val_dice_loss}"
                )

                os.makedirs(config['ckpt_dir'], exist_ok=True)
                save_checkpoint(model=model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                epoch=epoch,
                                global_step=global_step,
                                ckpt_dir=config['ckpt_dir'],
                                ckpt_fn=f'ckpt_{date_time}_Epoch_{epoch}.ckpt')

                if best_loss > val_loss:
                    best_loss = val_loss
                    for filename in glob.glob(
                            os.path.join(config['ckpt_dir'], "best_ckpt*")):
                        os.remove(filename)
                    save_checkpoint(
                        model=model,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        epoch=epoch,
                        global_step=global_step,
                        ckpt_dir=config['ckpt_dir'],
                        ckpt_fn=f'best_ckpt_{date_time}_epoch_{epoch}.ckpt')

        if config['freeze_encoder'] and config[
                'unfreeze_encoder_epoch'] is not None:
            if epoch >= int(config['unfreeze_encoder_epoch']):
                unfreeze(model.module.encoder)
                config['unfreeze_encoder_epoch'] = None
                logger.info(f'Unfreeze encoder at {epoch}')
        if config['freeze_decoder'] and config[
                'unfreeze_decoder_epoch'] is not None:
            if epoch >= int(config['unfreeze_decoder_epoch']):
                unfreeze(model.module.decoder)
                config['unfreeze_decoder_epoch'] = None
                logger.info(f'Unfreeze decoder at {epoch}')
    writer.close()
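The `freeze`/`unfreeze` helpers used above are not shown in this snippet; a minimal sketch consistent with how they are called would be:

def freeze(module):
    # Stop gradients for all parameters of the given sub-network.
    for p in module.parameters():
        p.requires_grad = False


def unfreeze(module):
    # Re-enable gradients for all parameters of the given sub-network.
    for p in module.parameters():
        p.requires_grad = True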
Example #43
def test_fn_findinmap_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/functions/Fn::FindInMap")
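`Fn::FindInMap` is documented as `[MapName, TopLevelKey, SecondLevelKey]`, so a valid instance could look like:

# Documented Fn::FindInMap shape: [MapName, TopLevelKey, SecondLevelKey].
valid_findinmap = {'Fn::FindInMap': ['RegionMap', {'Ref': 'AWS::Region'}, 'AMI']}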
Example #44
def main(args):
    #===========================================================================
    # Set the file name format
    FILE_NAME_FORMAT = "{0}_{1}_{2:d}_{3:d}_{4:d}_{5:d}_{6:d}_{7:d}{8}".format(
                                    args.model, args.dataset,
                                    args.batch_size, args.dim_model,
                                    args.dim_ff, args.dim_KV, args.num_layers,
                                    args.num_heads, args.flag)

    # Set the results file path
    RESULT_FILE_NAME = FILE_NAME_FORMAT+'_results.pkl'
    RESULT_FILE_PATH = os.path.join(RESULTS_PATH, RESULT_FILE_NAME)
    # Set the checkpoint file path
    CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'.ckpt'
    CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH, CHECKPOINT_FILE_NAME)
    BEST_CHECKPOINT_FILE_NAME = FILE_NAME_FORMAT+'_best.ckpt'
    BEST_CHECKPOINT_FILE_PATH = os.path.join(CHECKPOINT_PATH,
                                                BEST_CHECKPOINT_FILE_NAME)

    # Set the random seed same for reproducibility
    random.seed(190811)
    torch.manual_seed(190811)
    torch.cuda.manual_seed_all(190811)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Setting constants
    dim_model = args.dim_model      # Dimension of model (=Embedding size)
    dim_ff = args.dim_ff            # Dimension of FeedForward
    dim_K = args.dim_KV             # Dimension of Key(=Query)
    dim_V = args.dim_KV             # Dimension of Value
    num_layers = args.num_layers    # Number of encoder/decoder layers
    num_heads = args.num_heads      # Number of heads in Multi-Head Attention
    dropout_p = args.dropout_p      # Dropout probability
    warmup_steps = 4000             # Learning-rate warm-up steps
    label_smoothing_eps = 0.1       # Label smoothing epsilon
    max_src_len = 46                # Maximum source input length (Multi30k)
    max_trg_len = 45                # Maximum target input length (Multi30k)

    # Step1 ====================================================================
    # Load dataset
    if args.dataset == 'WMT2014':
        dataloader = WMT2014_Dataloader()
    elif args.dataset == 'Multi30k':
        dataloader = Multi30k_Dataloader()
    else:
        assert False, "Please select the proper dataset."

    train_loader = dataloader.get_train_loader(batch_size=args.batch_size)
    val_loader = dataloader.get_val_loader(batch_size=args.batch_size)
    print('==> DataLoader ready.')

    # Step2 ====================================================================
    # Make Translation model
    if args.model == 'Transformer':
        src_vocab_size = len(dataloader.SRC.vocab)
        trg_vocab_size = len(dataloader.TRG.vocab)
        model = Transformer(src_vocab_size, trg_vocab_size,
                            max_src_len, max_trg_len, dim_model, dim_K,
                            num_layers, num_heads, dim_ff, dropout_p)
    else:
        assert False, "Please select the proper model."

    # Check DataParallel available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Check CUDA available
    if torch.cuda.is_available():
        model.cuda()

    print('==> Model ready.')

    # Step3 ====================================================================
    # Set loss function and optimizer (+ lrate scheduler)
    if args.smoothing:
        criterion = Criterion_LabelSmoothing(vocab_size=trg_vocab_size,
                                            padding_idx=dataloader.pad_idx,
                                            smoothing_eps=label_smoothing_eps)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=dataloader.pad_idx)
    optimizer = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    lr_scheduler = Warmup_scheduler(optimizer, dim_model, warmup_steps)
    print('==> Criterion and optimizer ready.')

    # Step4 ====================================================================
    # Train and validate the model
    start_epoch = 0
    best_val_metric = 0

    if args.resume:
        assert os.path.exists(CHECKPOINT_FILE_PATH), 'No checkpoint file!'
        checkpoint = torch.load(CHECKPOINT_FILE_PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        lr_scheduler.current_step = checkpoint['current_step']
        best_val_metric = checkpoint['best_val_metric']

    # Save the training information
    result_data = {}
    result_data['model']            = args.model
    result_data['dataset']          = args.dataset
    result_data['target epoch']     = args.epochs
    result_data['batch_size']       = args.batch_size

    # Initialize the result lists
    train_loss = []
    train_ppl = []
    train_bleu = []

    val_loss = []
    val_ppl = []
    val_bleu = []

    # Check the directory of the file path
    if not os.path.exists(os.path.dirname(RESULT_FILE_PATH)):
        os.makedirs(os.path.dirname(RESULT_FILE_PATH))
    if not os.path.exists(os.path.dirname(CHECKPOINT_FILE_PATH)):
        os.makedirs(os.path.dirname(CHECKPOINT_FILE_PATH))
    print('==> Train ready.')

    for epoch in range(args.epochs):
        # start after the checkpoint epoch
        if epoch < start_epoch:
            continue
        print("\n[Epoch: {:3d}/{:3d}]".format(epoch+1, args.epochs))
        epoch_time = time.time()
        #=======================================================================
        # train the model
        tloss, tmetric = train(model, train_loader, criterion,
                                    optimizer, lr_scheduler, dataloader)
        train_loss.append(tloss)
        train_ppl.append(tmetric[0])
        train_bleu.append(tmetric[1])

        # validate the model
        vloss, vmetric = val(model, val_loader, criterion, dataloader)
        val_loss.append(vloss)
        val_ppl.append(vmetric[0])
        val_bleu.append(vmetric[1])

        #=======================================================================
        current = time.time()

        # Save the current result
        result_data['current epoch']    = epoch
        result_data['train_loss']       = train_loss
        result_data['train_ppl']        = train_ppl
        result_data['train_bleu']       = train_bleu
        result_data['val_loss']         = val_loss
        result_data['val_ppl']          = val_ppl
        result_data['val_bleu']         = val_bleu

        # Save result_data as pkl file
        with open(RESULT_FILE_PATH, 'wb') as pkl_file:
            pickle.dump(result_data, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

        # Save the best checkpoint
        if vmetric[1] > best_val_metric:
            best_val_metric = vmetric[1]
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'current_step': lr_scheduler.current_step,
                'best_val_metric': best_val_metric,
                }, BEST_CHECKPOINT_FILE_PATH)

        # Save the current checkpoint
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'current_step': lr_scheduler.current_step,
            'val_metric': vmetric[0],
            # 'best_val_metric': best_val_metric,
            }, CHECKPOINT_FILE_PATH)

        # Print the information on the console
        print("model                : {}".format(args.model))
        print("dataset              : {}".format(args.dataset))
        print("batch_size           : {}".format(args.batch_size))
        print("current step         : {:d}".format(lr_scheduler.current_step))
        print("current lrate        : {:f}".format(optimizer.param_groups[0]['lr']))
        print("train/val loss       : {:f}/{:f}".format(tloss,vloss))
        print("train/val PPL        : {:f}/{:f}".format(tmetric[0],vmetric[0]))
        print("train/val BLEU       : {:f}/{:f}".format(tmetric[1],vmetric[1]))
        print("epoch time           : {0:.3f} sec".format(current - epoch_time))
        print("Current elapsed time : {0:.3f} sec".format(current - start))
    print('==> Train done.')

    print(' '.join(['Results have been saved at', RESULT_FILE_PATH]))
    print(' '.join(['Checkpoints have been saved at', CHECKPOINT_FILE_PATH]))
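`Warmup_scheduler(optimizer, dim_model, warmup_steps)` is stepped once per batch; assuming it follows the warm-up schedule from the original Transformer paper, a minimal sketch would be:

class Warmup_scheduler:
    # Sketch of the 'Noam' schedule the wrapper is assumed to implement:
    # lr = dim_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    def __init__(self, optimizer, dim_model, warmup_steps):
        self.optimizer = optimizer
        self.dim_model = dim_model
        self.warmup_steps = warmup_steps
        self.current_step = 0

    def step(self):
        self.current_step += 1
        lr = (self.dim_model ** -0.5) * min(
            self.current_step ** -0.5,
            self.current_step * self.warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr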
Example #45
def test_fn_if_valid(instance):
    val.val(instance,
            basic_types_schema,
            definition="#/definitions/condition_functions/Fn::If")