Example #1
def train(output_filename, model_type, hidden_size, loss_type, norm_type,
          sigma_noise):
    train_data = torchvision.datasets.MNIST(
        root='datasets/mnist/',
        train=True,
        transform=torchvision.transforms.ToTensor(),
        download=False,
    )

    train_loader = Data.DataLoader(dataset=train_data,
                                   batch_size=BATCH_SIZE,
                                   shuffle=True)

    if loss_type == 'l2':
        loss_func = nn.MSELoss()
    elif loss_type == 'cross_entropy':
        loss_func = F.binary_cross_entropy

    if model_type == 'AE':
        model = AutoEncoder(hidden_size).cuda()
    elif model_type == 'LTAE':
        model = LatentAutoEncoder(hidden_size, norm_type,
                                  sigma=sigma_noise).cuda()
        model.set_device()
    elif model_type == 'VAE':
        model = VariationalAE(hidden_size).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(EPOCH):
        for step, (x, _) in enumerate(train_loader):
            optimizer.zero_grad()

            x_batch = x.view(-1, 28 * 28).cuda()
            y_batch = x.view(-1, 28 * 28).cuda()

            if model_type == 'AE':
                _, decoded = model(x_batch)
                loss = loss_func(decoded, y_batch)
            elif model_type == 'LTAE':
                _, latent, transformed, decoded = model(x_batch)
                loss = loss_func(decoded, y_batch)
                loss += torch.nn.functional.mse_loss(transformed, latent)
            elif model_type == 'VAE':
                decoded, mu, logvar = model(x_batch)
                loss = loss_func_vae(decoded, x_batch, mu, logvar, loss_type)

            loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.detach().cpu())

    torch.save({'state_dict': model.state_dict()},
               f'./saved_models/{output_filename}')
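
Note: the VAE branch above calls a loss_func_vae helper that the snippet does not define. A minimal sketch under the usual VAE assumptions (diagonal Gaussian posterior with parameters mu and logvar); the original's exact reduction and weighting may differ:

import torch
import torch.nn.functional as F

def loss_func_vae(decoded, x, mu, logvar, loss_type):
    # reconstruction term, mirroring the loss_type switch above
    if loss_type == 'l2':
        recon = F.mse_loss(decoded, x, reduction='sum')
    else:
        recon = F.binary_cross_entropy(decoded, x, reduction='sum')
    # analytic KL divergence between N(mu, sigma^2) and N(0, I)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return (recon + kld) / x.size(0)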
Example #2
def main(args):
    device = torch.device(
        'cuda' if torch.cuda.is_available() and not args.cpu else 'cpu')
    print('Using %s device.' % device)

    world_size = int(
        os.environ[args.env_size]) if args.env_size in os.environ else 1
    local_rank = int(
        os.environ[args.env_rank]) if args.env_rank in os.environ else 0

    if local_rank == 0:
        print(vars(args))

    if world_size > 1:
        print('rank: {}/{}'.format(local_rank + 1, world_size))
        torch.distributed.init_process_group(backend='gloo',
                                             init_method='file://%s' %
                                             args.tmpname,
                                             rank=local_rank,
                                             world_size=world_size)

    train_dataloader, test_dataloader = load_dataset(args, device, world_size)

    net = AutoEncoder(input_dim=1900, nlayers=args.nlayers,
                      latent=100).to(device)

    if world_size > 1:
        net = torch.nn.parallel.DistributedDataParallel(net)

    if args.modelfile:
        net.load_state_dict(torch.load(args.modelfile))

    # define our optimizer and loss function
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    loss_func = nn.MSELoss(reduction='mean')

    test_losses = []

    for epoch in range(args.epochs):
        epoch_start = timeit.default_timer()

        train(train_dataloader, net, optimizer, loss_func, epoch)
        test_loss = test(test_dataloader, net, loss_func)

        print(' %5.2f sec' % (timeit.default_timer() - epoch_start))

        test_losses.append(test_loss)

        if test_loss <= min(test_losses):
            torch.save(net.state_dict(), 'model/%5.3f.pth' % min(test_losses))
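
Note: the train and test helpers called by main() are not included in this example. Minimal sketches of what they plausibly do, assuming the network returns the reconstruction directly and each batch is a single tensor already on the right device:

def train(dataloader, net, optimizer, loss_func, epoch):
    net.train()
    for x in dataloader:
        optimizer.zero_grad()
        loss = loss_func(net(x), x)
        loss.backward()
        optimizer.step()

def test(dataloader, net, loss_func):
    net.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for x in dataloader:
            total += loss_func(net(x), x).item()
            batches += 1
    return total / max(batches, 1)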
Example #3
def main(args):
    # ensures that weight initializations are all the same
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    logging = utils.Logger(args.global_rank, args.save)
    writer = utils.Writer(args.global_rank, args.save)

    # Get data loaders.
    train_queue, valid_queue, num_classes, _ = datasets.get_loaders(args)
    args.num_total_iter = len(train_queue) * args.epochs
    warmup_iters = len(train_queue) * args.warmup_epochs
    swa_start = len(train_queue) * (args.epochs - 1)

    arch_instance = utils.get_arch_cells(args.arch_instance)

    model = AutoEncoder(args, writer, arch_instance)
    model = model.cuda()

    logging.info('args = %s', args)
    logging.info('param size = %fM ', utils.count_parameters_in_M(model))
    logging.info('groups per scale: %s, total_groups: %d',
                 model.groups_per_scale, sum(model.groups_per_scale))

    if args.fast_adamax:
        # Fast adamax has the same functionality as torch.optim.Adamax, except it is faster.
        cnn_optimizer = Adamax(model.parameters(),
                               args.learning_rate,
                               weight_decay=args.weight_decay,
                               eps=1e-3)
    else:
        cnn_optimizer = torch.optim.Adamax(model.parameters(),
                                           args.learning_rate,
                                           weight_decay=args.weight_decay,
                                           eps=1e-3)

    cnn_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        cnn_optimizer,
        float(args.epochs - args.warmup_epochs - 1),
        eta_min=args.learning_rate_min)
    grad_scalar = GradScaler(2**10)

    num_output = utils.num_output(args.dataset, args)
    bpd_coeff = 1. / np.log(2.) / num_output

    # resume from a checkpoint if requested
    checkpoint_file = os.path.join(args.save, 'checkpoint.pt')
    if args.cont_training:
        logging.info('loading the model.')
        checkpoint = torch.load(checkpoint_file, map_location='cpu')
        init_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        model = model.cuda()
        cnn_optimizer.load_state_dict(checkpoint['optimizer'])
        grad_scalar.load_state_dict(checkpoint['grad_scalar'])
        cnn_scheduler.load_state_dict(checkpoint['scheduler'])
        global_step = checkpoint['global_step']
    else:
        global_step, init_epoch = 0, 0

    for epoch in range(init_epoch, args.epochs):
        # update lrs.
        if args.distributed:
            train_queue.sampler.set_epoch(global_step + args.seed)
            valid_queue.sampler.set_epoch(0)

        if epoch > args.warmup_epochs:
            cnn_scheduler.step()

        # Logging.
        logging.info('epoch %d', epoch)

        # Training.
        train_nelbo, global_step = train(train_queue, model, cnn_optimizer,
                                         grad_scalar, global_step,
                                         warmup_iters, writer, logging)
        logging.info('train_nelbo %f', train_nelbo)
        writer.add_scalar('train/nelbo', train_nelbo, global_step)

        model.eval()
        # generate samples less frequently
        eval_freq = 1 if args.epochs <= 50 else 20
        if epoch % eval_freq == 0 or epoch == (args.epochs - 1):
            with torch.no_grad():
                num_samples = 16
                n = int(np.floor(np.sqrt(num_samples)))
                for t in [0.7, 0.8, 0.9, 1.0]:
                    logits = model.sample(num_samples, t)
                    output = model.decoder_output(logits)
                    output_img = output.mean if isinstance(
                        output, torch.distributions.bernoulli.Bernoulli
                    ) else output.sample(t)
                    output_tiled = utils.tile_image(output_img, n)
                    writer.add_image('generated_%0.1f' % t, output_tiled,
                                     global_step)

            valid_neg_log_p, valid_nelbo = test(valid_queue,
                                                model,
                                                num_samples=10,
                                                args=args,
                                                logging=logging)
            logging.info('valid_nelbo %f', valid_nelbo)
            logging.info('valid neg log p %f', valid_neg_log_p)
            logging.info('valid bpd elbo %f', valid_nelbo * bpd_coeff)
            logging.info('valid bpd log p %f', valid_neg_log_p * bpd_coeff)
            writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch)
            writer.add_scalar('val/nelbo', valid_nelbo, epoch)
            writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff,
                              epoch)
            writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch)

        save_freq = int(np.ceil(args.epochs / 100))
        if epoch % save_freq == 0 or epoch == (args.epochs - 1):
            if args.global_rank == 0:
                logging.info('saving the model.')
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'optimizer': cnn_optimizer.state_dict(),
                        'global_step': global_step,
                        'args': args,
                        'arch_instance': arch_instance,
                        'scheduler': cnn_scheduler.state_dict(),
                        'grad_scalar': grad_scalar.state_dict()
                    }, checkpoint_file)

    # Final validation
    valid_neg_log_p, valid_nelbo = test(valid_queue,
                                        model,
                                        num_samples=1000,
                                        args=args,
                                        logging=logging)
    logging.info('final valid nelbo %f', valid_nelbo)
    logging.info('final valid neg log p %f', valid_neg_log_p)
    writer.add_scalar('val/neg_log_p', valid_neg_log_p, epoch + 1)
    writer.add_scalar('val/nelbo', valid_nelbo, epoch + 1)
    writer.add_scalar('val/bpd_log_p', valid_neg_log_p * bpd_coeff, epoch + 1)
    writer.add_scalar('val/bpd_elbo', valid_nelbo * bpd_coeff, epoch + 1)
    writer.close()
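
Note: the bpd values logged above are the NELBO in nats rescaled to bits per dimension, bpd = nelbo / (ln 2 * num_output). A quick sanity check (784 is the per-image dimensionality of binarized 28x28 MNIST, used here only as an illustration):

import numpy as np

num_output = 784
bpd_coeff = 1. / np.log(2.) / num_output
print(543.5 * bpd_coeff)  # a 543.5-nat NELBO corresponds to ~1.0 bits/dim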
Example #4
def main():

    cuda_available = torch.cuda.is_available()
    train_params, dataset_params = get_arguments()
    net = AutoEncoder()
    epoch_trained = 0
    if train_params['restore_model']:
        net = load_model(net, train_params['restore_dir'],
                         train_params['restore_model'])
        if net is None:
            print("Initialize network and train from scratch.")
            net = AutoEncoder()
        else:
            epoch_trained = 0

    train_loader, validation = audio_data_loader(**dataset_params)

    if cuda_available is False:
        warnings.warn(
            "Cuda is not avalable, can not train model using multi-gpu.")
    if cuda_available:
        # Remove train_params["device_ids"] for single GPU
        if train_params["device_ids"]:
            batch_size = dataset_params["batch_size"]
            num_gpu = len(train_params["device_ids"])
            assert batch_size % num_gpu == 0
            net = nn.DataParallel(net, device_ids=train_params['device_ids'])
        torch.backends.cudnn.benchmark = True
        net = net.cuda()

    criterion = nn.MSELoss()
    optimizer = get_optimizer(net, train_params['optimizer'],
                              train_params['learning_rate'],
                              train_params['momentum'])

    if cuda_available:
        criterion = criterion.cuda()
    if not os.path.exists(train_params['log_dir']):
        os.makedirs(train_params['log_dir'])
    if not os.path.exists(train_params['restore_dir']):
        os.makedirs(train_params['restore_dir'])
    train_loss_log_file = open(train_params['log_dir'] + 'train_loss_log.log',
                               'a')
    test_loss_log_file = open(train_params['log_dir'] + 'test_loss_log.log',
                              'a')

    # Add print for start of training time
    time = str(datetime.now())
    line = 'Training started at ' + time + '\n'
    train_loss_log_file.writelines(line)
    train_loss_log_file.flush()

    # Keep track of losses
    train_losses = []
    eval_losses = []
    best_eval = float('inf')

    # Begin!
    for epoch in range(train_params['num_epochs']):
        train(net, criterion, optimizer, train_losses, train_params,
              train_loss_log_file, train_loader, cuda_available)
        eval_loss = evaluate(net, criterion, epoch, eval_losses, validation,
                             test_loss_log_file, cuda_available)
        if eval_loss < best_eval:

            save_model(net, 1, train_params['restore_dir'])

            torch.save(net.state_dict(),
                       train_params['restore_dir'] + 'bestmodel.pth')
            best_eval = eval_loss

        save_model(net, epoch_trained + epoch + 1, train_params['restore_dir'])
        torch.save([train_losses, eval_losses, epoch],
                   train_params['restore_dir'] + 'data_params')

    # Add print for end of training time
    time = str(datetime.now())
    line = 'Training ended at ' + time + '\n'
    train_loss_log_file.writelines(line)
    train_loss_log_file.flush()

    train_loss_log_file.close()
    test_loss_log_file.close()
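
Note: get_optimizer is not shown in this example. One plausible shape for it, assuming it dispatches on an optimizer name and that only SGD uses the momentum argument:

import torch

def get_optimizer(net, name, learning_rate, momentum):
    if name == 'adam':
        return torch.optim.Adam(net.parameters(), lr=learning_rate)
    if name == 'sgd':
        return torch.optim.SGD(net.parameters(), lr=learning_rate,
                               momentum=momentum)
    raise ValueError('unsupported optimizer: {}'.format(name))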
Example #5
dataloader_train = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True)
dataloader_valid = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              shuffle=True)

for epoch in range(50):
    train_loss = []
    valid_loss = []
    for data in tqdm(dataloader_train):
        img = data[0]
        img = img.view(img.shape[0], -1)
        output = model(img)
        loss = criterion(output, img)
        train_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # validate without tracking gradients
    model.eval()
    with torch.no_grad():
        for data in tqdm(dataloader_valid):
            img = data[0]
            img = img.view(img.shape[0], -1)
            output = model(img)
            loss = criterion(output, img)
            valid_loss.append(loss.item())
    model.train()
    print(f"epoch: {epoch}, Train Loss: {np.mean(train_loss)}")
    print(f"epoch: {epoch}, Valid Loss: {np.mean(valid_loss)}")
    writer.add_scalar("Train loss", np.mean(train_loss), epoch)
    writer.add_scalar("Valid loss", np.mean(valid_loss), epoch)

torch.save(model.state_dict(), './autoencoder.pth')
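
Note: the loop above relies on objects defined earlier in the script. A minimal setup it could run against (the layer sizes and learning rate here are illustrative, not taken from the original):

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

batch_size = 128  # also needs train_dataset / valid_dataset yielding (img, label)
model = nn.Sequential(nn.Linear(28 * 28, 128), nn.ReLU(),
                      nn.Linear(128, 28 * 28), nn.Sigmoid())
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
writer = SummaryWriter()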
Example #6
        for i, data in enumerate(val_dataloader, 1):
            partial_input, coarse_gt, dense_gt = data

            partial_input = partial_input.to(DEVICE)
            coarse_gt = coarse_gt.to(DEVICE)
            dense_gt = dense_gt.to(DEVICE)
            partial_input = partial_input.permute(0, 2, 1)

            v, y_coarse, y_detail = network(partial_input)

            y_coarse = y_coarse.permute(0, 2, 1)
            y_detail = y_detail.permute(0, 2, 1)

            loss = loss_d1(coarse_gt, y_coarse) + args.alpha * loss_d2(
                dense_gt, y_detail)
            total_loss += loss.item()
            iter_count += 1

        mean_loss = total_loss / iter_count
        print("\033[31mValidation epoch {}/{}, loss is {}\033[0m".format(
            epoch, args.epochs, mean_loss))

        # records the best model and epoch
        if mean_loss < minimum_loss:
            best_epoch = epoch
            minimum_loss = mean_loss
            torch.save(network.state_dict(), args.log_dir + '/lowest_loss.pth')

    print("\033[31mBest model (lowest loss) in epoch {}\033[0m".format(
        best_epoch))
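
Note: loss_d1 and loss_d2 are not defined in this excerpt; in point-cloud completion code they are typically Chamfer distances between predicted and ground-truth clouds. A dense O(N*M) sketch, assuming inputs of shape (B, N, 3):

import torch

def chamfer_distance(p, q):
    d = torch.cdist(p, q)  # (B, N, M) pairwise Euclidean distances
    return d.min(dim=2)[0].mean() + d.min(dim=1)[0].mean()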
Example #7
    # Load data
    data_loader_train = load_data(args.data_dir, args.batch_size)

    lowest_loss = float("inf")
    history_train_loss = []

    try:
        for epoch in range(args.epochs):
            t0 = time.time()
            train_loss = train(model, data_loader_train, args.device)
            print("\nTraining Epoch: %d, Train Loss: %.4f, Elapsed: %.1fs" %
                  (epoch + 1, train_loss, time.time() - t0))

            history_train_loss.append(train_loss)

            if train_loss < lowest_loss:
                torch.save(model.state_dict(), 'weights_unsup.pth')
                print("Weight Saved")
                lowest_loss = train_loss
            exp_lr_scheduler.step(train_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    plt.plot(history_train_loss, label='Train Loss')
    plt.title('Training Loss')
    plt.legend()
    plt.show()
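
Note: exp_lr_scheduler.step(train_loss) is called with a metric, which suggests a ReduceLROnPlateau-style scheduler. One possible (assumed) definition; the model and hyperparameters below are placeholders:

import torch
import torch.nn as nn

model = nn.Linear(8, 8)  # placeholder for the real model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
exp_lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5)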
Example #8
def main():
    parser = argparse.ArgumentParser(description='AvatarNet by Pytorch')
    parser.add_argument('--batch_size',
                        '-b',
                        type=int,
                        default=4,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch',
                        '-e',
                        type=int,
                        default=2,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--patch_size',
                        '-p',
                        type=int,
                        default=5,
                        help='Size of extracted patches from style features')
    parser.add_argument('--alpha',
                        '-a',
                        type=float,
                        default=0.8,
                        help='alpha controls the fusion degree')
    parser.add_argument('--lam1',
                        type=float,
                        default=0.01,
                        help='lambda1 for perceptual loss')
    parser.add_argument('--lam2',
                        type=float,
                        default=0.01,
                        help='lambda2 for tv loss')
    parser.add_argument('--gpu',
                        '-g',
                        type=int,
                        default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--learning_rate',
                        '-lr',
                        type=float,
                        default=1e-4,
                        help='learning rate for Adam')
    parser.add_argument('--snapshot_interval',
                        type=int,
                        default=10,
                        help='Interval of snapshot to generate image')
    parser.add_argument('--train_content_dir',
                        type=str,
                        default='/data/chen/content',
                        help='content images directory for train')
    parser.add_argument('--train_style_dir',
                        type=str,
                        default='/data/chen/style',
                        help='style images directory for train')
    parser.add_argument('--test_content_dir',
                        type=str,
                        default='/data/chen/content',
                        help='content images directory for test')
    parser.add_argument('--test_style_dir',
                        type=str,
                        default='/data/chen/style',
                        help='style images directory for test')
    parser.add_argument('--save_dir',
                        type=str,
                        default='result',
                        help='save directory for result and loss')
    parser.add_argument('--reuse',
                        default=None,
                        help='model state path to load for reuse')

    args = parser.parse_args()

    # create directory to save
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    loss_dir = f'{args.save_dir}/loss'
    model_state_dir = f'{args.save_dir}/model_state'
    image_dir = f'{args.save_dir}/image'

    for directory in (loss_dir, model_state_dir, image_dir):
        os.makedirs(directory, exist_ok=True)

    # set device on GPU if available, else CPU
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device(f'cuda:{args.gpu}')
        print(f'# CUDA available: {torch.cuda.get_device_name(args.gpu)}')
    else:
        device = 'cpu'

    print(f'# Minibatch-size: {args.batch_size}')
    print(f'# epoch: {args.epoch}')
    print('')

    # prepare dataset and dataLoader
    train_dataset = PreprocessDataset(args.train_content_dir,
                                      args.train_style_dir)
    test_dataset = PreprocessDataset(args.test_content_dir,
                                     args.test_style_dir)
    iters = len(train_dataset)
    print(f'Length of train image pairs: {iters}')

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=False)
    test_iter = iter(test_loader)

    # set model and optimizer
    model = AutoEncoder().to(device)
    if args.reuse is not None:
        model.load_state_dict(torch.load(args.reuse))
    optimizer = Adam(model.parameters(), lr=args.learning_rate)

    # start training
    loss_list = []
    for e in range(1, args.epoch + 1):
        print(f'Start {e} epoch')
        for i, (content, style) in tqdm(enumerate(train_loader, 1)):
            content = content.to(device)
            style = style.to(device)
            loss = model(content, style, args.patch_size, args.alpha,
                         args.lam1, args.lam2)
            loss_list.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(
                f'[{e}/total {args.epoch} epoch],[{i} /'
                f'total {round(iters/args.batch_size)} iteration]: {loss.item()}'
            )

            if i % args.snapshot_interval == 0:
                try:
                    content, style = next(test_iter)
                except StopIteration:
                    # restart the test iterator once it is exhausted
                    test_iter = iter(test_loader)
                    content, style = next(test_iter)
                content = content.to(device)
                style = style.to(device)
                with torch.no_grad():
                    out = model.generate(content, style, args.patch_size,
                                         args.alpha)
                content = denorm(content, device)
                style = denorm(style, device)
                out = denorm(out, device)
                res = torch.cat([content, style, out], dim=0)
                res = res.to('cpu')
                save_image(res,
                           f'{image_dir}/{e}_epoch_{i}_iteration.png',
                           nrow=args.batch_size)
        torch.save(model.state_dict(), f'{model_state_dir}/{e}_epoch.pth')
    plt.plot(range(len(loss_list)), loss_list)
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.title('train loss')
    plt.savefig(f'{loss_dir}/train_loss.png')
    with open(f'{loss_dir}/loss_log.txt', 'w') as f:
        for l in loss_list:
            f.write(f'{l}\n')
    print(f'Loss saved in {loss_dir}')
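
Note: the denorm helper used before save_image is not shown. If the dataset applies ImageNet-style normalization (an assumption), denorm would invert the (mean, std) transform and clamp to [0, 1]:

import torch

def denorm(x, device):
    mean = torch.tensor([0.485, 0.456, 0.406], device=device).view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=device).view(1, 3, 1, 1)
    return torch.clamp(x * std + mean, 0., 1.)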
Example #9
class Trainer(object):
    def __init__(self, train_loader, test_loader, config):
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.num_epochs = config.num_epochs
        self.lr = config.lr

        self.in_channel = config.in_channel
        self.image_size = config.image_size
        self.hidden_dim = config.hidden_dim
        self.output_dim = config.output_dim

        self.log_interval = config.log_interval
        self.sample_interval = config.sample_interval
        self.ckpt_interval = config.ckpt_interval

        self.sample_folder = config.sample_folder
        self.ckpt_folder = config.ckpt_folder

        self.build_net()
        self.vis = Visualizer()

    def build_net(self):
        # define network
        self.net = AutoEncoder(self.in_channel, self.image_size,
                               self.hidden_dim, self.output_dim)

        if self.config.mode == 'test' and self.config.training_path == '':
            print("[*] Enter model path!")
            exit()

        # if training model exists
        if self.config.training_path != '':
            self.net.load_state_dict(
                torch.load(self.config.training_path,
                           map_location=lambda storage, loc: storage))
            print("[*] Load weight from {}!".format(self.config.training_path))

        self.net.to(self.device)

    # add noise to image
    def add_noise(self, imgs):
        noise = torch.randn_like(imgs) * 0.4  # keep noise on the input's device
        noisy_imgs = noise + imgs
        return noisy_imgs

    def train(self):
        # define loss function
        bce_criterion = nn.BCELoss().to(self.device)
        mse_criterion = nn.MSELoss().to(self.device)

        # define optimizer
        optimizer = Adam(self.net.parameters(), self.lr)

        step = 0
        print("[*] Learning started!")

        # get fixed sample
        temp_iter = iter(self.train_loader)
        fixed_imgs, _ = next(temp_iter)
        fixed_imgs = fixed_imgs.to(self.device)

        # save fixed sample image
        x_path = os.path.join(self.sample_folder, 'fixed_input.png')
        save_image(fixed_imgs, x_path, normalize=True)
        print("[*] Save fixed input image!")

        # make fixed noisy sample and save
        fixed_noisy_imgs = self.add_noise(fixed_imgs)
        noisy_x_path = os.path.join(self.sample_folder,
                                    'fixed_noisy_input.png')
        save_image(fixed_noisy_imgs, noisy_x_path, normalize=True)
        print("[*] Save fixed noisy input image!")

        # flatten data tensors
        fixed_imgs = fixed_imgs.view(fixed_imgs.size(0), -1)
        fixed_noisy_imgs = fixed_noisy_imgs.view(fixed_imgs.size(0), -1)

        for epoch in range(self.num_epochs):
            for i, (imgs, _) in enumerate(self.train_loader):
                self.net.train()

                imgs = imgs.view(imgs.size(0), -1).to(self.device)  # original images (targets)
                noisy_imgs = self.add_noise(imgs)  # add noise
                noisy_imgs = noisy_imgs.to(self.device)

                # forwarding
                outputs = self.net(noisy_imgs)  # use noisy image as input
                bce_loss = bce_criterion(outputs, imgs)
                mse_loss = mse_criterion(outputs, imgs)

                # backwarding
                optimizer.zero_grad()
                bce_loss.backward()  # backward BCE loss
                optimizer.step()

                # do logging
                if (step + 1) % self.log_interval == 0:
                    print("[{}/{}] [{}/{}] BCE loss: {:3f}, MSE loss:{:3f}".
                          format(epoch + 1, self.num_epochs, i + 1,
                                 len(self.train_loader),
                                 bce_loss.item() / len(imgs),
                                 mse_loss.item() / len(imgs)))
                    self.vis.plot("BCE Loss plot", bce_loss.item() / len(imgs))
                    self.vis.plot("MSE Loss plot", mse_loss.item() / len(imgs))

                # do sampling
                if (step + 1) % self.sample_interval == 0:
                    with torch.no_grad():
                        outputs = self.net(fixed_noisy_imgs)
                    x_hat = outputs.cpu().view(outputs.size(0), -1,
                                               self.image_size,
                                               self.image_size)
                    x_hat_path = os.path.join(
                        self.sample_folder,
                        'output_epoch{}.png'.format(epoch + 1))
                    save_image(x_hat, x_hat_path, normalize=True)

                    print("[*] Save sample images!")

                step += 1

            if (epoch + 1) % self.ckpt_interval == 0:
                ckpt_path = os.path.join(self.ckpt_folder,
                                         'ckpt_epoch{}.pth'.format(epoch + 1))
                torch.save(self.net.state_dict(), ckpt_path)
                print("[*] Checkpoint saved!")

        print("[*] Learning finished!")
        ckpt_path = os.path.join(self.ckpt_folder, 'final_model.pth')
        torch.save(self.net.state_dict(), ckpt_path)
        print("[*] Final weight saved!")
Example #10
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                total += 1

            pbar.update(1)

    train_loss = total_loss / total

    network.eval()
    total_loss = 0
    total = 0
    with torch.no_grad():
        with tqdm(total=test_gen.get_total_samples() / batch_size) as pbar:
            for audios, labels in test_gen.generator():
                if (np.min(audios) != 0 and np.max(audios) != 0
                        and np.min(labels) != 0 and np.max(labels) != 0):
                    audios = audios / 60
                    labels = labels / 60
                    audios = torch.from_numpy(audios).float().cuda()
                    labels = torch.from_numpy(labels).float().cuda()
                    outputs = network.forward(audios)
                    loss = criterion(outputs, labels)
                    total_loss += loss.item()
                    total += 1

    test_loss = total_loss / total

    torch.save(network.state_dict(), "model.pt")

    print("epoch: ", epoch, "train loss: ", train_loss, "test loss: ", test_loss)
Example #11
        d_loss = dis_loss(real_dis, validity)
        optimizer_b.zero_grad()
        d_loss.backward(retain_graph=True)
        optimizer_b.step()
    return _loss


print('training for {} steps'.format(args.n_steps))

for epoch in range(args.n_steps):
    # for idx, (images, _) in enumerate(dataloader):
    a = next(itera)
    b = next(iterb)
    images_a = torch.tensor(a, device=device).float()
    images_b = torch.tensor(b, device=device).float()
    loss_a = train_step(images_a, version='a')
    loss_b = train_step(images_b, version='b')
    to_print = "Epoch[{}/{}] Loss A:{}, Loss B:{}".format(epoch+1, args.n_steps, loss_a.data, loss_b.data)
    if epoch % 1000 == 0:
        print(to_print)
        model_state_dict = model.state_dict()
        torch.save(model_state_dict, '{}/{}.pt'.format(args.saved_dir, args.model_name))
if save:
    model_state_dict = model.state_dict()
    torch.save(model_state_dict, '{}/model.pt'.format(args.saved_dir))
else:
    model.load_state_dict(torch.load('{}/model.pt'.format(args.saved_dir)))
Example #12
    def train(self, config):
        """Training routine"""
        # Initialize datasets for both training and validation
        train_data = torchvision.datasets.ImageFolder(
            root=os.path.join(config.data_dir, "train"),
            transform=torchvision.transforms.ToTensor())
        valid_data = torchvision.datasets.ImageFolder(
            root=os.path.join(config.data_dir, "valid"),
            transform=torchvision.transforms.ToTensor())

        # Create data loader for training and validation.
        tr_data_loader = torch.utils.data.DataLoader(
            dataset=train_data,
            batch_size=config.batch_size,
            num_workers=config.numWorker,
            shuffle=True)
        va_data_loader = torch.utils.data.DataLoader(
            dataset=valid_data,
            batch_size=config.batch_size,
            num_workers=config.numWorker,
            shuffle=False)

        # Create model instance.
        #model = Model()
        model = AutoEncoder()

        # Move model to gpu if cuda is available
        if torch.cuda.is_available():
            model = model.cuda()
        # Make sure that the model is set for training
        model.train()

        # Create loss objects
        data_loss = nn.MSELoss()

        # Create optimizer
        optimizer = optim.Adam(model.parameters(), lr=config.learn_rate)
        # No need to move the optimizer (as of PyTorch 1.0); it lives in the
        # same space as the model.

        # Create summary writer
        tr_writer = SummaryWriter(
            log_dir=os.path.join(config.log_dir, "train"))
        va_writer = SummaryWriter(
            log_dir=os.path.join(config.log_dir, "valid"))

        # Create log directory and save directory if it does not exist
        if not os.path.exists(config.log_dir):
            os.makedirs(config.log_dir)
        if not os.path.exists(config.save_dir):
            os.makedirs(config.save_dir)

        # Initialize training
        iter_idx = -1  # make counter start at zero
        best_va_acc = 0  # best validation accuracy seen so far
        # Prepare checkpoint file and model file to save and load from
        checkpoint_file = os.path.join(config.save_dir, "checkpoint.pth")
        bestmodel_file = os.path.join(config.save_dir, "best_model.pth")

        # Check for existing training results. If they exist and the
        # configuration is set to resume (`config.resume == True`), resume from
        # the previous training. If not, delete the existing checkpoint.
        if os.path.exists(checkpoint_file):
            if config.resume:
                # Use `torch.load` to load the checkpoint file, then load the
                # things that are required to continue training. For the model
                # and the optimizer, use `load_state_dict`. It is a good idea
                # to code the saving part first and then code this part.
                print("Checkpoint found! Resuming")  # TODO proper logging
                # Read checkpoint file.

                # Fix gpu -> cpu bug
                compute_device = 'cuda' if torch.cuda.is_available() else 'cpu'
                load_res = torch.load(checkpoint_file,
                                      map_location=compute_device)

                # Resume iterations
                iter_idx = load_res["iter_idx"]
                # Resume best va result
                best_va_acc = load_res["best_va_acc"]
                # Resume model
                model.load_state_dict(load_res["model"])

                # Resume optimizer
                optimizer.load_state_dict(load_res["optimizer"])
                # Note that we do not resume the epoch, since we will never be able
                # to properly recover the shuffling, unless we remember the random
                # seed, for example. For simplicity, we will simply ignore this,
                # and run `config.num_epoch` epochs regardless of resuming.
            else:
                os.remove(checkpoint_file)

        # Training loop
        for epoch in range(config.num_epoch):
            # For each iteration
            prefix = "Training Epoch {:3d}: ".format(epoch)

            for data in tqdm(tr_data_loader, desc=prefix):
                # Counter
                iter_idx += 1

                # Split the data
                # x is img, y is label
                x, y = data
                #print(x)
                # Send data to GPU if we have one
                if torch.cuda.is_available():
                    x = x.cuda()
                    y = y.cuda()

                # Apply the model to obtain scores (forward pass)
                logits = model.forward(x)
                # Compute the loss
                loss = data_loss(logits, x.float())
                # Compute gradients
                loss.backward()
                # Update parameters
                optimizer.step()
                # Zero the parameter gradients in the optimizer
                optimizer.zero_grad()

                # Monitor results every report interval
                if iter_idx % config.rep_intv == 0:
                    # Compute accuracy (no gradients required). We wrap this
                    # part so that torch does not compute gradients.
                    with torch.no_grad():
                        pred = torch.argmax(logits, dim=1)
                        acc = torch.mean(
                            torch.eq(pred.view(x.size()), x).float()) * 100.0
                    # Write loss and accuracy to tensorboard, using keywords `loss`
                    # and `accuracy`.
                    tr_writer.add_scalar("loss", loss, global_step=iter_idx)
                    tr_writer.add_scalar("accuracy", acc, global_step=iter_idx)

                    # Save
                    torch.save(
                        {
                            "iter_idx": iter_idx,
                            "best_va_acc": best_va_acc,
                            "model": model.state_dict(),
                            "optimizer": optimizer.state_dict(),
                            "loss": loss,
                            "epoch": epoch,
                            "acc": acc
                        }, checkpoint_file)

                # Validate results every validation interval
                if iter_idx % config.val_intv == 0:
                    # Lists to collect the losses and accuracies over all the
                    # validation batches
                    va_loss = []
                    va_acc = []
                    # Set model for evaluation
                    model = model.eval()
                    for data in va_data_loader:

                        # Split the data
                        x, y = data

                        # Send data to GPU if we have one
                        if torch.cuda.is_available():
                            x = x.cuda()
                            y = y.cuda()

                        # Apply forward pass to compute the losses
                        # and accuracies for each of the validation batches
                        with torch.no_grad():
                            # Compute logits
                            logits = model.forward(x)
                            # Compute loss and store as numpy
                            loss = data_loss(logits, x.float())
                            va_loss += [loss.cpu().numpy()]
                            # Compute accuracy and store as numpy
                            pred = torch.argmax(logits, dim=1)
                            acc = torch.mean(
                                torch.eq(pred.view(x.size()),
                                         x).float()) * 100.0
                            va_acc += [acc.cpu().numpy()]
                    # Set model back for training
                    model = model.train()
                    # Take average
                    va_loss = np.mean(va_loss)
                    va_acc = np.mean(va_acc)

                    # Write to tensorboard using `va_writer`
                    va_writer.add_scalar("loss", va_loss, global_step=iter_idx)
                    va_writer.add_scalar("accuracy",
                                         va_acc,
                                         global_step=iter_idx)
                    # Check if best accuracy
                    if va_acc > best_va_acc:
                        best_va_acc = va_acc
                        # Save best model using torch.save. Similar to previous
                        # save but at location defined by `bestmodel_file`
                        torch.save(
                            {
                                "iter_idx": iter_idx,
                                "best_va_acc": best_va_acc,
                                "model": model.state_dict(),
                                "optimizer": optimizer.state_dict(),
                                "loss": loss,
                                "acc": acc
                            }, bestmodel_file)