Example #1
0
File: run.py  Project: mlzxy/dac
def train():
    """Run the training loop: optimize the model on ``train_loader``,
    log train/test metrics every ``args.test_freq`` steps, and write
    checkpoints to ``save_dir`` every ``args.save_freq`` steps.

    Relies on module-level state: ``model``, ``net``, ``args``,
    ``save_dir``, ``train_loader``, ``exp_id``, ``get_logger``,
    ``Accumulator``, and ``test``.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    model.load_from_ckpt()

    # Persist the run's hyperparameters alongside the checkpoints
    # so the experiment can be reproduced later.
    args_path = os.path.join(save_dir, 'args.json')
    with open(args_path, 'w') as f:
        json.dump(args.__dict__, f, sort_keys=True, indent=4)

    optimizer, scheduler = model.build_optimizer()
    log_name = 'train_' + time.strftime('%Y%m%d-%H%M') + '.log'
    logger = get_logger(exp_id, os.path.join(save_dir, log_name))
    accm = Accumulator(*model.metrics)
    train_accm = Accumulator('loss')

    tick = time.time()
    latest_ckpt = os.path.join(save_dir, 'model.tar')
    for step, batch in enumerate(train_loader, 1):
        # One optimization step: forward, backward, gradient clip, update.
        net.train()
        optimizer.zero_grad()
        loss = model.loss_fn(batch)
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.clip)
        optimizer.step()
        scheduler.step()
        train_accm.update(loss.item())

        if step % args.test_freq == 0:
            # Log running train loss plus a fresh evaluation pass,
            # then clear both accumulators for the next window.
            line = 'step {}, lr {:.3e}, train loss {:.4f}, '.format(
                    step, optimizer.param_groups[0]['lr'],
                    train_accm.get('loss'))
            line += test(accm=accm, verbose=False)
            logger.info(line)
            accm.reset()
            train_accm.reset()

        if step % args.save_freq == 0:
            if args.save_all:
                # Keep a step-numbered snapshot in addition to the latest.
                torch.save(net.state_dict(),
                        os.path.join(save_dir, 'model{}.tar'.format(step)))
            torch.save(net.state_dict(), latest_ckpt)

    # Final checkpoint after the loader is exhausted.
    torch.save(net.state_dict(), latest_ckpt)
Example #2
0
File: run.py  Project: OpenXAIProject/dac
def train():
    """Train ``model``, logging combined train/test metrics every
    ``args.test_freq`` steps and checkpointing every ``args.save_freq``
    steps.

    Relies on module-level state: ``model``, ``args``, ``save_dir``,
    ``exp_id``, ``get_logger``, and ``Accumulator``; all training
    mechanics are delegated to the ``model`` object.
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Persist hyperparameters next to the checkpoints for reproducibility.
    with open(os.path.join(save_dir, 'args.json'), 'w') as f:
        json.dump(args.__dict__, f, sort_keys=True, indent=4)

    # Data loaders and optimizer are constructed by the model itself.
    model.build_train_loader()
    model.build_test_loader()
    model.build_optimizer()

    train_accm = Accumulator(*model.train_metrics)
    test_accm = Accumulator(*model.test_metrics)
    log_name = 'train_' + time.strftime('%Y%m%d-%H%M') + '.log'
    logger = get_logger(exp_id, os.path.join(save_dir, log_name))

    ckpt_path = os.path.join(save_dir, 'model.tar')
    for step, batch in enumerate(model.train_loader, 1):
        model.train_batch(batch, train_accm)

        if step % args.test_freq == 0:
            # Compose one log line: step, lr, train metrics, test metrics.
            pieces = ['step {}, '.format(step),
                      model.get_lr_string() + ', ',
                      train_accm.info(header='train', show_et=False)]
            model.test(test_accm)
            pieces.append(test_accm.info(header='test', ))
            logger.info(''.join(pieces))
            train_accm.reset()
            test_accm.reset()

        if step % args.save_freq == 0:
            model.save(ckpt_path)

    # Final checkpoint after the loader is exhausted.
    model.save(ckpt_path)