Exemplo n.º 1
0
def main():
    """Train a character-level RNN name classifier, plot the loss curve,
    and render a confusion matrix of category predictions.

    Relies on module-level globals defined elsewhere in the file:
    n_hidden, n_iters, print_every, plot_every, all_losses, plus the
    helpers Dataset, RNN, train, evalute, randomTrainingExample,
    categoryFromOutput and timeSince.
    """
    dataset = Dataset()
    dataset.read_name_data()
    rnn = RNN(dataset.n_letters, n_hidden, dataset.n_categories)

    start = time.time()
    current_loss = 0
    # Renamed the loop variable from `iter` to avoid shadowing the builtin.
    for iteration in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = \
            randomTrainingExample(dataset.all_categories,
                                  dataset.category_lines)
        output, loss = train(rnn, category_tensor, line_tensor)
        current_loss += loss

        # Print iteration number, loss, name and guess
        if iteration % print_every == 0:
            guess, guess_i = categoryFromOutput(output, dataset)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('%d %d%% (%s) %.4f %s / %s %s' %
                  (iteration, iteration / n_iters * 100, timeSince(start),
                   loss, line, guess, correct))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    # Plotting the historical loss
    plt.figure()
    plt.plot(all_losses)
    plt.title("The negative log likelihood(NLL) loss per iter")
    plt.xlabel("n_iter")
    plt.ylabel("NLL loss")
    plt.show()

    # Evaluate the trained RNN with a confusion matrix: rows are true
    # categories, columns are predicted categories.
    confusion = torch.zeros(dataset.n_categories, dataset.n_categories)
    n_confusion = 10000

    for i in range(n_confusion):
        category, line, category_tensor, line_tensor = \
            randomTrainingExample(dataset.all_categories,
                                  dataset.category_lines)

        # NOTE(review): `evalute` looks like a typo of `evaluate`, but the
        # name must match the helper actually defined elsewhere in the file,
        # so it is deliberately left unchanged here.
        output = evalute(rnn, line_tensor)
        guess, guess_i = categoryFromOutput(output, dataset)
        category_i = dataset.all_categories.index(category)
        confusion[category_i][guess_i] += 1

    # Normalise each row so every true category sums to 1.
    for i in range(dataset.n_categories):
        confusion[i] = confusion[i] / confusion[i].sum()

    # Set up plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(confusion.numpy())
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + dataset.all_categories, rotation=90)
    ax.set_yticklabels([''] + dataset.all_categories)

    # Force label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # sphinx_gallery_thumbnail_number = 2
    plt.show()
Exemplo n.º 2
0
def main():
    """Train (or run inference with) the DeepLPF image-enhancement network.

    Command-line flags select the mode: when both --checkpoint_filepath and
    --inference_img_dirpath are given, a saved model is loaded and evaluated
    on the images in that directory; otherwise the network is trained on
    Adobe5k image pairs with periodic validation, saving the model whenever
    the validation PSNR improves.
    """

    writer = SummaryWriter()

    # Per-run logging directory, timestamped to avoid collisions.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dirpath = "./log_" + timestamp
    os.mkdir(log_dirpath)

    # Log both to a file in the run directory and to the console.
    handlers = [logging.FileHandler(
        log_dirpath + "/deep_lpf.log"), logging.StreamHandler()]
    logging.basicConfig(
        level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', handlers=handlers)

    parser = argparse.ArgumentParser(
        description="Train the DeepLPF neural network on image pairs")

    parser.add_argument(
        "--num_epoch", type=int, required=False, help="Number of epoches (default 5000)", default=100000)
    parser.add_argument(
        "--valid_every", type=int, required=False, help="Number of epoches after which to compute validation accuracy",
        default=25)
    parser.add_argument(
        "--checkpoint_filepath", required=False, help="Location of checkpoint file", default=None)
    parser.add_argument(
        "--inference_img_dirpath", required=False,
        help="Directory containing images to run through a saved DeepLPF model instance", default=None)

    args = parser.parse_args()
    num_epoch = args.num_epoch
    valid_every = args.valid_every
    checkpoint_filepath = args.checkpoint_filepath
    inference_img_dirpath = args.inference_img_dirpath

    logging.info('######### Parameters #########')
    logging.info('Number of epochs: ' + str(num_epoch))
    logging.info('Logging directory: ' + str(log_dirpath))
    logging.info('Dump validation accuracy every: ' + str(valid_every))
    logging.info('##############################')

    # NOTE(review): dataset locations are hard-coded absolute paths — these
    # only resolve on the original author's machine.
    training_data_loader = Adobe5kDataLoader(data_dirpath="/home/sjm213/adobe5k/adobe5k/",
                                             img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_train.txt")
    training_data_dict = training_data_loader.load_data()
    training_dataset = Dataset(data_dict=training_data_dict, transform=transforms.Compose(
        [transforms.ToPILImage(), transforms.RandomHorizontalFlip(), transforms.RandomVerticalFlip(),
         transforms.ToTensor()]),
        normaliser=2 ** 8 - 1, is_valid=False)

    validation_data_loader = Adobe5kDataLoader(data_dirpath="/home/sjm213/adobe5k/adobe5k/",
                                               img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_valid.txt")
    validation_data_dict = validation_data_loader.load_data()
    validation_dataset = Dataset(data_dict=validation_data_dict,
                                 transform=transforms.Compose([transforms.ToTensor()]), normaliser=2 ** 8 - 1,
                                 is_valid=True)

    testing_data_loader = Adobe5kDataLoader(data_dirpath="/home/sjm213/adobe5k/adobe5k/",
                                            img_ids_filepath="/home/sjm213/adobe5k/adobe5k/images_test.txt")
    testing_data_dict = testing_data_loader.load_data()
    testing_dataset = Dataset(data_dict=testing_data_dict,
                              transform=transforms.Compose([transforms.ToTensor()]), normaliser=2 ** 8 - 1,
                              is_valid=True)

    # Wrap the datasets in DataLoaders; only training data is shuffled.
    training_data_loader = torch.utils.data.DataLoader(training_dataset, batch_size=1, shuffle=True,
                                                       num_workers=10)
    testing_data_loader = torch.utils.data.DataLoader(testing_dataset, batch_size=1, shuffle=False,
                                                      num_workers=10)
    validation_data_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=1,
                                                         shuffle=False,
                                                         num_workers=10)

    # Inference mode: both a checkpoint and an inference directory were given.
    if (checkpoint_filepath is not None) and (inference_img_dirpath is not None):

        inference_data_loader = Adobe5kDataLoader(data_dirpath=inference_img_dirpath,
                                                  img_ids_filepath=inference_img_dirpath+"/images_inference.txt")
        inference_data_dict = inference_data_loader.load_data()
        inference_dataset = Dataset(data_dict=inference_data_dict,
                                    transform=transforms.Compose([transforms.ToTensor()]), normaliser=2 ** 8 - 1,
                                    is_valid=True)

        inference_data_loader = torch.utils.data.DataLoader(inference_dataset, batch_size=1, shuffle=False,
                                                            num_workers=10)

        '''
        Performs inference on all the images in inference_img_dirpath
        '''
        logging.info(
            "Performing inference with images in directory: " + inference_img_dirpath)

        # map_location keeps the load on CPU regardless of where it was saved.
        net = torch.load(checkpoint_filepath,
                         map_location=lambda storage, location: storage)

        # switch model to evaluation mode
        net.eval()

        criterion = model.DeepLPFLoss()

        testing_evaluator = metric.Evaluator(
            criterion, inference_data_loader, "test", log_dirpath)

        testing_evaluator.evaluate(net, epoch=0)

    else:

        print(torch.cuda.is_available())
        net = model.DeepLPFNet()
        net.cuda(0)

        logging.info('######### Network created #########')
        logging.info('Architecture:\n' + str(net))

        # List the trainable parameters for the log.
        for name, param in net.named_parameters():
            if param.requires_grad:
                print(name)

        criterion = model.DeepLPFLoss(ssim_window_size=5)

        '''
        The following objects allow for evaluation of a model on the testing and validation splits of a dataset
        '''
        validation_evaluator = metric.Evaluator(
            criterion, validation_data_loader, "valid", log_dirpath)
        testing_evaluator = metric.Evaluator(
            criterion, testing_data_loader, "test", log_dirpath)

        optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=1e-4, betas=(0.9, 0.999),
                               eps=1e-08)
        best_valid_psnr = 0.0

        alpha = 0.0
        optimizer.zero_grad()
        net.train()

        running_loss = 0.0
        examples = 0
        psnr_avg = 0.0
        ssim_avg = 0.0
        batch_size = 1
        total_examples = 0

        for epoch in range(num_epoch):

            # Train loss
            examples = 0.0
            running_loss = 0.0
            
            for batch_num, data in enumerate(training_data_loader, 0):

                input_img_batch, output_img_batch, category = Variable(data['input_img'],
                                                                       requires_grad=False).cuda(), Variable(data['output_img'],
                                                                                                             requires_grad=False).cuda(), data[
                    'name']

                start_time = time.time()
                net_output_img_batch = net(
                    input_img_batch)
                # Clamp predictions into the valid normalised image range.
                net_output_img_batch = torch.clamp(
                    net_output_img_batch, 0.0, 1.0)

                elapsed_time = time.time() - start_time

                loss = criterion(net_output_img_batch, output_img_batch)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # NOTE(review): loss.data[0] is the pre-0.4 PyTorch idiom
                # (consistent with the Variable usage above); on modern
                # PyTorch this would be loss.item().
                running_loss += loss.data[0]
                examples += batch_size
                total_examples+=batch_size

                writer.add_scalar('Loss/train', loss.data[0], total_examples)

            logging.info('[%d] train loss: %.15f' %
                         (epoch + 1, running_loss / examples))
            writer.add_scalar('Loss/train_smooth', running_loss / examples, epoch + 1)

            # Valid loss
            '''
            examples = 0.0
            running_loss = 0.0

            for batch_num, data in enumerate(validation_data_loader, 0):

                net.eval()

                input_img_batch, output_img_batch, category = Variable(
                    data['input_img'],
                    requires_grad=False).cuda(), Variable(data['output_img'],
                                                         requires_grad=False).cuda(), \
                    data[
                    'name']

                net_output_img_batch = net(
                    input_img_batch)
                net_output_img_batch = torch.clamp(
                    net_output_img_batch, 0.0, 1.0)

                optimizer.zero_grad()

                loss = criterion(net_output_img_batch, output_img_batch)

                running_loss += loss.data[0]
                examples += batch_size
                total_examples+=batch_size

                writer.add_scalar('Loss/train', loss.data[0], total_examples)

            logging.info('[%d] valid loss: %.15f' %
                         (epoch + 1, running_loss / examples))
            writer.add_scalar('Loss/valid_smooth', running_loss / examples, epoch + 1)

            net.train()
            '''

            # Periodically evaluate on the validation and test splits.
            if (epoch + 1) % valid_every == 0:

                logging.info("Evaluating model on validation and test dataset")

                valid_loss, valid_psnr, valid_ssim = validation_evaluator.evaluate(
                    net, epoch)
                test_loss, test_psnr, test_ssim = testing_evaluator.evaluate(
                    net, epoch)

                # update best validation set psnr
                if valid_psnr > best_valid_psnr:

                    logging.info(
                        "Validation PSNR has increased. Saving the more accurate model to file: " + 'deeplpf_validpsnr_{}_validloss_{}_testpsnr_{}_testloss_{}_epoch_{}_model.pt'.format(valid_psnr,
                                                                                                                                                                                         valid_loss.tolist()[0], test_psnr, test_loss.tolist()[
                                                                                                                                                                                             0],
                                                                                                                                                                                         epoch))

                    best_valid_psnr = valid_psnr
                    snapshot_prefix = os.path.join(
                        log_dirpath, 'deeplpf')
                    snapshot_path = snapshot_prefix + '_validpsnr_{}_validloss_{}_testpsnr_{}_testloss_{}_epoch_{}_model.pt'.format(valid_psnr,
                                                                                                                                    valid_loss.tolist()[
                                                                                                                                        0],
                                                                                                                                    test_psnr, test_loss.tolist()[
                                                                                                                                        0],
                                                                                                                                    epoch)
                    torch.save(net.state_dict(), snapshot_path)

                net.train()

        '''
        Run the network over the testing dataset split
        '''
        testing_evaluator.evaluate(net, epoch=0)

        # Save the final model state after all epochs complete.
        snapshot_prefix = os.path.join(log_dirpath, 'deep_lpf')
        snapshot_path = snapshot_prefix + "_" + str(num_epoch)
        torch.save(net.state_dict(), snapshot_path)
Exemplo n.º 3
0
def train(args):
    """Build and fit a normalizing-flow model from a parsed config.

    ``args`` is a config-parser-like mapping of sections to string values;
    several entries are eval()'d into Python objects. Optionally distributes
    training across nodes with tarantella; only the primary node logs losses
    to disk.

    NOTE(review): eval() on config values executes arbitrary code — only
    acceptable while config files are fully trusted. For the literal-valued
    entries, ast.literal_eval would be the safe alternative.
    """
    use_tarantella = eval(args['training']['use_tarantella'])
    ndims_tot = np.prod(eval(args['data']['data_dimensions']))
    output_dir = args['checkpoints']['output_dir']
    sched_milestones = eval(args['training']['milestones_lr_decay'])
    n_epochs = eval(args['training']['N_epochs'])
    optimizer_kwargs = eval(args['training']['optimizer_kwargs'])
    optimizer_type = args['training']['optimizer']
    optimizer_lr = eval(args['training']['lr'])

    if use_tarantella:
        import tarantella
        # no argument (otherwise: ranks per node)
        tarantella.init()
        node_rank = tarantella.get_rank()
        nodes_number = tarantella.get_size()
    else:
        node_rank = 0
        nodes_number = 1
    is_primary_node = (node_rank == 0)

    # Record the distributed topology back into the config for downstream use.
    args['training']['rank'] = repr(node_rank)
    args['training']['comm_size'] = repr(nodes_number)

    model = build_model(args)
    data = Dataset(args)

    print(f'NODE_RANK {node_rank}')
    print(f'N_NODES {nodes_number}')
    print(f'NODE_RANK {str(is_primary_node).upper()}', flush=True)

    def nll_loss_z_part(y, z):
        # Gaussian prior term of the negative log-likelihood.
        zz = tf.math.reduce_mean(z**2)
        return 0.5 * zz

    def nll_loss_jac_part(y, jac):
        # Log-determinant term, normalised per data dimension.
        return -tf.math.reduce_mean(jac) / ndims_tot

    def lr_sched(ep, lr):
        # Step decay: multiply lr by 0.1 at each milestone epoch.
        if ep in sched_milestones:
            return 0.1 * lr
        return lr

    # TODO: should this only be for one node, or for each?
    lr_scheduler_callback = kr.callbacks.LearningRateScheduler(
        lr_sched, verbose=is_primary_node)

    callbacks = [lr_scheduler_callback, kr.callbacks.TerminateOnNaN()]

    if is_primary_node:
        # Only the primary node writes the loss log to disk.
        loss_log_callback = kr.callbacks.CSVLogger(os.path.join(
            output_dir, 'losses.dat'),
                                                   separator=' ')
        callbacks.append(loss_log_callback)

    # Map the config string to a Keras optimizer class; fall back to eval()
    # so a fully-qualified class path can also be given in the config.
    try:
        optimizer_type = {
            'ADAM': kr.optimizers.Adam,
            'SGD': kr.optimizers.SGD
        }[optimizer_type]
    except KeyError:
        optimizer_type = eval(optimizer_type)

    optimizer = optimizer_type(optimizer_lr, **optimizer_kwargs)

    if use_tarantella:
        model = tarantella.Model(model)

    model.compile(loss=[nll_loss_z_part, nll_loss_jac_part],
                  optimizer=optimizer,
                  run_eagerly=False)
    model.build((128, 32, 32, 3))

    # The original wrapped this call in `try: ... except: raise`, which is a
    # no-op; call fit directly. The unused `history` binding is also dropped.
    model.fit(
        data.train_dataset,
        epochs=n_epochs,
        verbose=is_primary_node,
        callbacks=callbacks,
        validation_data=(data.test_dataset if is_primary_node else None))
Exemplo n.º 4
0
 def setUp(self):
     """Build a small Dataset fixture with matching source/target fields."""
     fields = {name: Field() for name in ('source', 'target')}
     examples = {name: [1, 2, 3] for name in ('source', 'target')}
     self.fields = fields
     self.examples = examples
     self.dataset = Dataset(examples, fields)
def main(args):
    """Train (or evaluate) the controller Model on the given dataset.

    Saves dataset metadata to args.save_dir, optionally resumes from
    args.ckpt, then alternates training epochs with periodic validation,
    checkpointing the best model by validation metric (BCE; lower is
    better) as well as a per-epoch snapshot.
    """
    dataset = Dataset(args)
    os.makedirs(args.save_dir, exist_ok=True)
    # Persist dataset metadata so later runs can rebuild the same vocab.
    with open(os.path.join(args.save_dir, 'dataset_info'), 'wb') as wf:
        pickle.dump(dataset.dataset_info, wf)
    if args.task == 'rhyme':
        with open(os.path.join(args.save_dir, 'rhyme_info'), 'wb') as wf:
            pickle.dump(dataset.rhyme_info, wf)
    if args.ckpt:
        # Resume: rebuild the model from the checkpoint's own saved args.
        # NOTE(review): start_epoch is computed here but never used below —
        # the training loop always starts from epoch 0.
        checkpoint = torch.load(args.ckpt, map_location=args.device)
        start_epoch = checkpoint['epoch'] + 1
        best_val_metric = checkpoint['best_metric']
        model_args = checkpoint['args']
        model = Model(
            model_args,
            dataset.gpt_pad_id,
            len(dataset.index2word),
            rhyme_group_size=len(dataset.index2rhyme_group)
            if args.task == 'rhyme' else None
        )  # no need to get the glove embeddings when reloading since they're saved in model ckpt anyway
        model.load_state_dict(checkpoint['state_dict'])
        model = model.to(args.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=model_args.lr)
        optimizer.load_state_dict(checkpoint['optimizer'])
        data_start_index = checkpoint['data_start_index']
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.ckpt, checkpoint['epoch']))
        # NOTE: just import pdb after loading the model here if you want to play with it, it's easy
        # model.eval()
        # import pdb; pdb.set_trace()
    else:
        # Fresh start: initialise the model with GloVe embeddings.
        model = Model(args,
                      dataset.gpt_pad_id,
                      len(dataset.index2word),
                      rhyme_group_size=len(dataset.index2rhyme_group)
                      if args.task == 'rhyme' else None,
                      glove_embeddings=dataset.glove_embeddings)
        model = model.to(args.device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_val_metric = 1e8  # lower is better for BCE
        data_start_index = 0
    print('num params', num_params(model))
    criterion = nn.BCEWithLogitsLoss().to(args.device)

    if args.evaluate:
        # Evaluation-only mode: run a single validation pass and exit.
        epoch = 0
        validate(model, dataset, criterion, epoch, args)
        return
    for epoch in range(args.epochs):
        print("TRAINING: Epoch {} at {}".format(epoch, time.ctime()))
        # data_start_index threads the dataset position across epochs.
        data_start_index = train(model, dataset, optimizer, criterion, epoch,
                                 args, data_start_index)
        if epoch % args.validation_freq == 0:
            print("VALIDATION: Epoch {} at {}".format(epoch, time.ctime()))
            metric = validate(model, dataset, criterion, epoch, args)

            if not args.debug:
                if metric < best_val_metric:
                    print('new best val metric', metric)
                    best_val_metric = metric
                    save_checkpoint(
                        {
                            'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'best_metric': best_val_metric,
                            'optimizer': optimizer.state_dict(),
                            'data_start_index': data_start_index,
                            'args': args
                        }, os.path.join(args.save_dir, 'model_best.pth.tar'))
                # Also keep a per-epoch snapshot regardless of best status.
                save_checkpoint(
                    {
                        'epoch': epoch,
                        'state_dict': model.state_dict(),
                        'best_metric': metric,
                        'optimizer': optimizer.state_dict(),
                        'data_start_index': data_start_index,
                        'args': args
                    },
                    os.path.join(args.save_dir,
                                 'model_epoch' + str(epoch) + '.pth.tar'))
Exemplo n.º 6
0
Arquivo: main.py Projeto: hfxunlp/ATR
def train(model, loss, params):
    """Train a recurrent language model with plain SGD and lr back-off.

    Runs params.epochs epochs over train.txt, printing loss/perplexity every
    params.disp_freq steps. After each epoch the model is scored on dev.txt:
    on improvement the whole model is checkpointed to params.save, otherwise
    the learning rate is divided by 4.
    """
    train_data = Dataset(
        os.path.join(params.data_dir, "train.txt"),
        params.vocab,
    )

    total_loss = 0.
    global_step = 0
    start_time = time.time()
    best_valid_loss = None
    hidden = model.init_hidden(params.batch_size)
    lrate = params.lr
    for epoch in range(params.epochs):
        for bidx, batch in enumerate(
                train_data.batcher(
                    params.batch_size,
                    params.num_steps,
                )):
            model.train()

            # Each batch is (input tokens, target tokens).
            x, t = batch

            x = x.to(params.device)
            t = t.to(params.device)

            # Detach the hidden state so gradients stop at batch boundaries
            # (truncated backpropagation through time).
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            logits, hidden = model(x, state=hidden)
            gloss = loss(logits.view(-1, logits.size(-1)), t.view(-1))

            gloss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), params.clip)

            # Manual SGD step: p <- p - lrate * grad.
            for p in model.parameters():
                p.data.add_(-lrate, p.grad.data)

            total_loss += gloss.item()

            if global_step > 0 \
                    and global_step % params.disp_freq == 0:
                sub_loss = total_loss / params.disp_freq
                duration = time.time() - start_time
                print('| Train | epoch {:3d} | {:5d} batches | '
                      'lr {:.5f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, bidx, lrate,
                          duration * 1000 / params.disp_freq, sub_loss,
                          math.exp(sub_loss)))
                total_loss = 0.
                start_time = time.time()

            global_step += 1

        # start evaluation
        # keep the batch_size as default, since we do not need so
        # accurate batch_size
        # NOTE: `eval` here is a project-level evaluation helper defined
        # elsewhere in the file (it shadows the builtin eval).
        score, speed = eval(model, loss,
                            os.path.join(params.data_dir, 'dev.txt'), params)
        print('|  Dev  | epoch {:3d} | ms/batch {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(epoch, speed, score,
                                                  math.exp(score)))

        # Checkpoint on improvement; otherwise back the learning rate off.
        if not best_valid_loss or score < best_valid_loss:
            best_valid_loss = score
            with open(params.save, 'wb') as f:
                torch.save(model, f)
        else:
            lrate /= 4.0
Exemplo n.º 7
0
from data import Dataset
from network import Network
import os
import datetime, re

# Configuration for an image-approximation training run.
batchsize = 10

#visualize_after = 1000
# Visualise the reconstructed image every ~2000 processed samples.
visualize_image_after = 2000 / batchsize

#matrix = np.random.rand(matrix_h, matrix_w, 3) * (matrix_max_xy - matrix_min_xy) + matrix_min_xy

# Target image to approximate.
image_filename = "woman.png"
#image_filename = "parrot.png"
#image_filename = "parrot.png"
dataset = Dataset(image_filename)

#alpha = 0.99
alpha = 0.991

# "strmost" (Czech: steepness) — ramped from 1 to strmost_final over the
# schedule below; exact semantics live in the network code — TODO confirm.
strmost = 1
strmost_increase_after = 25000 * 4
strmost_increase_until = strmost_increase_after * 2
strmost_final = 30

step = 1

# Create logdir name
logdir = "logs/{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"))
Exemplo n.º 8
0
if __name__ == "__main__":
    # Smoke-test script: build shared-BPE tokenizers, a GNMT model, and the
    # WMT14 en-de dataset, then iterate the training dataloader.
    print('test bpe tok ...')
    # Source and target share one BPE model/vocab (bpe_tok is an alias).
    bpe_tok = src_tok = Tokenizer(
        'en',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')
    trg_tok = Tokenizer(
        'de',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')

    print("prepare model ...")
    gnmt = GNMT(src_tok, trg_tok, 512, 0.1).cuda()

    print('setup dataset ...')
    dataset = Dataset(src_tok, trg_tok,
                      '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000')
    dataset.setup()
    print(f'done.')

    tensorboard = SummaryWriter()
    train_tqdm = tqdm(dataset.train_dataloader(20000))
    for b in train_tqdm:

        # NOTE(review): this closure is defined but never invoked in the
        # visible code — presumably an optimizer.step(closure) call follows
        # in the original file; verify before reusing this snippet.
        def closure():
            gnmt.zero_grad()
            loss, acc = gnmt.train_step(b)
            loss.backward()
            train_tqdm.set_postfix({'loss': loss.item(), 'acc': acc.item()})
            tensorboard.add_scalar('train/loss', loss.item(), train_tqdm.n)
            tensorboard.add_scalar('train/acc', acc.item(), train_tqdm.n)
            return loss
Exemplo n.º 9
0
def main(args):
    """Train a compound PCFG model for unsupervised grammar induction.

    Each epoch visits mini-batches in random order, optimising NLL + KL,
    tracking predicted-span F1 against gold trees, and periodically logging
    progress (plus GPU memory stats). After every epoch the sentence-length
    curriculum is widened and the model is checkpointed whenever validation
    perplexity improves.
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    #train_loader = DataLoader(dataset=train_data, shuffle=True)
    #val_loader = DataLoader(dataset=val_data, shuffle=True)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    max_len = max(val_data.sents.size(1), train_data.sents.size(1))
    print('Train: %d sents / %d batches, Val: %d sents / %d batches' %
          (train_data.sents.size(0), len(train_data), val_data.sents.size(0),
           len(val_data)))
    print('Vocab size: %d, Max Sent Len: %d' % (vocab_size, max_len))
    print('Save Path', args.save_path)
    #cuda.set_device(args.gpu)
    model = GeneralCompPCFG(vocab=vocab_size,
                            state_dim=args.state_dim,
                            t_states=args.t_states,
                            nt_states=args.nt_states,
                            h_dim=args.h_dim,
                            w_dim=args.w_dim,
                            z_dim=args.z_dim,
                            prior=args.prior,
                            vpost=args.vpost)
    # model parallelize
    base_gpu = torch.device('cuda:0')
    model.to(base_gpu)
    model = BetterDataParallel(model)

    # Xavier-initialise all weight matrices (1-D params are left untouched).
    for name, param in model.named_parameters():
        if param.dim() > 1:
            xavier_uniform_(param)
    print("model architecture")
    print(model)
    model.train()
    # model.cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 betas=(args.beta1, args.beta2))
    best_val_ppl = 1e5
    best_val_f1 = 0
    epoch = 0
    num_gpus = torch.cuda.device_count()
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        print('Starting epoch %d' % epoch)
        train_nll = 0.
        train_kl = 0.
        num_sents = 0.
        num_words = 0.
        all_stats = [[0., 0., 0.]]
        b = 0
        n_gpus = torch.cuda.device_count()
        for i in np.random.permutation(len(train_data)):
            b += 1
            sents, length, batch_size, _, gold_spans, gold_binary_trees, _ = train_data[
                i]
            if length > args.max_length or length == 1:  #length filter based on curriculum
                continue
            with open("batchsize.log", 'a') as fp:
                fp.write(str(batch_size) + "\n")
            # Skip batches that cannot be split evenly across the GPUs.
            if batch_size == 0 or batch_size % n_gpus != 0:
                continue
            #if batch_size == 0 or batch_size % num_gpus != 0:   #gpu paraellization filter
            #  continue
            # sents = sents.cuda()
            #sents = sents.to(base_gpu)
            optimizer.zero_grad()
            nll, kl, binary_matrix, argmax_spans = model(sents, argmax=True)
            (nll + kl).mean().backward()
            train_nll += nll.sum().item()
            train_kl += kl.sum().item()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * (
                length + 1
            )  # we implicitly generate </s> so we explicitly count it
            # Accumulate span-F1 statistics per sentence in the batch.
            for bb in range(batch_size):
                span_b = [(a[0].item(), a[1].item())
                          for a in argmax_spans[bb]]  #ignore labels
                span_b_set = set(span_b[:-1])
                update_stats(span_b_set, [set(gold_spans[bb][:-1])], all_stats)
            if b % args.print_every == 0:
                all_f1 = get_f1(all_stats)
                # Parameter and gradient norms help spot divergence early.
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).item()**0.5
                gparam_norm = sum([
                    p.grad.norm()**2 for p in model.parameters()
                    if p.grad is not None
                ]).item()**0.5
                log_str = 'Epoch: %d, Batch: %d/%d, |Param|: %.6f, |GParam|: %.2f,  LR: %.4f, ' + \
                          'ReconPPL: %.2f, KL: %.4f, PPLBound: %.2f, ValPPL: %.2f, ValF1: %.2f, ' + \
                          'CorpusF1: %.2f, Throughput: %.2f examples/sec'
                print(log_str %
                      (epoch, b, len(train_data), param_norm, gparam_norm,
                       args.lr, np.exp(train_nll / num_words), train_kl /
                       num_sents, np.exp((train_nll + train_kl) / num_words),
                       best_val_ppl, best_val_f1, all_f1[0], num_sents /
                       (time.time() - start_time)))
                # print an example parse
                tree = get_tree_from_binary_matrix(binary_matrix[0], length)
                action = get_actions(tree)
                sent_str = [
                    train_data.idx2word[word_idx]
                    for word_idx in list(sents[0].cpu().numpy())
                ]
                print("Pred Tree: %s" % get_tree(action, sent_str))
                print("Gold Tree: %s" %
                      get_tree(gold_binary_trees[0], sent_str))
                with open("dummy_output.log", 'a') as fp:
                    fp.write(
                        log_str %
                        (epoch, b, len(train_data), param_norm, gparam_norm,
                         args.lr, np.exp(train_nll / num_words), train_kl /
                         num_sents, np.exp((train_nll + train_kl) / num_words),
                         best_val_ppl, best_val_f1, all_f1[0], num_sents /
                         (time.time() - start_time)) + "\n")
                    e = datetime.datetime.now()
                    fp.write("The time is now: = %s:%s:%s" %
                             (e.hour, e.minute, e.second) + "\n")
            with open("gpu_stats.log", "a") as fp:
                for i in range(torch.cuda.device_count()):
                    fp.write(
                        "GPU %d: %d\n" %
                        (i, torch.cuda.max_memory_allocated("cuda:" + str(i))))
                fp.write("\n")

        # Curriculum: widen the admissible sentence length every epoch.
        args.max_length = min(args.final_max_length,
                              args.max_length + args.len_incr)
        print('--------------------------------')
        print('Checking validation perf...')
        val_ppl, val_f1 = eval(val_data, model)
        print('--------------------------------')

        if val_ppl < best_val_ppl:
            best_val_ppl = val_ppl
            best_val_f1 = val_f1
            # Move the model to CPU so the checkpoint loads without a GPU,
            # then move it back to continue training.
            checkpoint = {
                'args': args.__dict__,
                'model': model.cpu(),
                'word2idx': train_data.word2idx,
                'idx2word': train_data.idx2word
            }
            print('Saving checkpoint to %s' % args.save_path)
            torch.save(checkpoint, args.save_path)
            # model.cuda()
            model.to(base_gpu)
Exemplo n.º 10
0
                    type=int,
                    help="whether load pretrained model")
parser.add_argument('--gpu_index',
                    default='0',
                    type=str,
                    help="whether load pretrained model")
# NOTE(review): the help text above looks copy-pasted from another flag —
# this option actually selects which GPU to make visible (see below).
FLAGS, _ = parser.parse_known_args()
log('Settings')
utils.showFLAGS(FLAGS)

#%% set logger
logger = LOGGER(FLAGS)
log('Create Logger Successfully')

#%% set train data
dataset = Dataset(FLAGS, logger)
n_images = len(dataset.train_names)
inputsize = dataset.input_size

#%% create model
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu_index
img_ch = 1
img_height, img_width = inputsize[0], inputsize[1]
# Three-layer SRCNN (9x9 -> 1x1 -> 5x5 convolutions) on single-channel input.
inputs = Input((img_height, img_width, img_ch))
x = Conv2D(64, kernel_size=(9, 9), padding='valid')(inputs)
x = Activation('relu')(x)
x = Conv2D(32, kernel_size=(1, 1), padding='valid')(x)
x = Activation('relu')(x)
x = Conv2D(1, kernel_size=(5, 5), padding='valid')(x)
x = Activation('relu')(x)
SRCNN = Model(inputs=inputs, outputs=x, name='SRCNN')
Exemplo n.º 11
0
        cls_file = os.path.join(label_path, id + "_cls.npy")
        mask_file = os.path.join(label_path, id + "_nd.npy")
        copyfile(cls_source, cls_file)
        copyfile(mask_source, mask_file)


if __name__ == '__main__':
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has three classes only - background, non-damaged, and damaged
    # NOTE(review): the comment above says three classes but num_classes is 6 —
    # confirm which labelling scheme the loaded checkpoint was trained with.
    num_classes = 6  # 3 or 6

    # Inference dataset: no training-time augmentation.
    dataset_test = Dataset("./datasets/Eureka_infer/102/",
                           "./datasets/Eureka_infer/102_labels/",
                           get_transform(train=False),
                           readsave=False)

    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   batch_size=1,
                                                   shuffle=False,
                                                   num_workers=2,
                                                   collate_fn=utils.collate_fn)

    mask_rcnn = get_model_instance_segmentation(num_classes,
                                                image_mean=None,
                                                image_std=None,
                                                stats=False)

    # Load previously trained weights for inference.
    mask_rcnn.load_state_dict(
        torch.load("trained_param_eureka_aug_mult/epoch_0021.param"))
Exemplo n.º 12
0
def main(args):
    """Train an RNNVAE language model in one of four modes.

    ``args.model`` selects the objective: 'autoreg' (plain LM NLL), 'vae'
    (amortized ELBO), 'svi' (variational params optimized from scratch per
    batch), or 'savae' (amortized init refined by SVI steps via OptimN2N).
    Checkpoints whenever validation NLL improves; also evaluates on test
    each epoch for monitoring.

    NOTE(review): ``Variable(...)`` and ``.data[0]`` are PyTorch <= 0.3
    idioms; this code will not run unchanged on modern PyTorch (use
    ``.item()`` and plain tensors there).
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    test_data = Dataset(args.test_file)
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    logger.info('Train data: %d batches' % len(train_data))
    logger.info('Val data: %d batches' % len(val_data))
    logger.info('Test data: %d batches' % len(test_data))
    logger.info('Word vocab size: %d' % vocab_size)

    checkpoint_dir = args.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    suffix = "%s_%s.pt" % (args.model, 'bl')
    checkpoint_path = os.path.join(checkpoint_dir, suffix)

    if args.slurm == 0:
        cuda.set_device(args.gpu)
    if args.train_from == '':
        model = RNNVAE(vocab_size=vocab_size,
                       enc_word_dim=args.enc_word_dim,
                       enc_h_dim=args.enc_h_dim,
                       enc_num_layers=args.enc_num_layers,
                       dec_word_dim=args.dec_word_dim,
                       dec_h_dim=args.dec_h_dim,
                       dec_num_layers=args.dec_num_layers,
                       dec_dropout=args.dec_dropout,
                       latent_dim=args.latent_dim,
                       mode=args.model)
        # Fresh model: small uniform init for all parameters.
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)
    else:
        logger.info('loading model from ' + args.train_from)
        checkpoint = torch.load(args.train_from)
        model = checkpoint['model']

    logger.info("model architecture")
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # KL weight (beta): fixed at 1 without warmup, otherwise annealed up
    # from 0.1 during training (see the per-batch update in the epoch loop).
    if args.warmup == 0:
        args.beta = 1.
    else:
        args.beta = 0.1

    criterion = nn.NLLLoss()
    model.cuda()
    criterion.cuda()
    model.train()

    def variational_loss(input, sents, model, z=None):
        """SVI objective: reconstruction NLL plus beta-weighted diagonal KL."""
        mean, logvar = input
        z_samples = model._reparameterize(mean, logvar, z)
        preds = model._dec_forward(sents, z_samples)
        # Token-level NLL summed over time; targets are the next tokens.
        nll = sum([
            criterion(preds[:, l], sents[:, l + 1])
            for l in range(preds.size(1))
        ])
        kl = utils.kl_loss_diag(mean, logvar)
        return nll + args.beta * kl

    # OptimN2N performs the inner SVI refinement over (mean, logvar);
    # decoder parameters are the ones whose gradients it accumulates.
    update_params = list(model.dec.parameters())
    meta_optimizer = OptimN2N(variational_loss,
                              model,
                              update_params,
                              eps=args.eps,
                              lr=[args.svi_lr1, args.svi_lr2],
                              iters=args.svi_steps,
                              momentum=args.momentum,
                              acc_param_grads=args.train_n2n == 1,
                              max_grad_norm=args.svi_max_grad_norm)
    if args.test == 1:
        # Test-only mode: evaluate (project-local `eval`) and exit.
        args.beta = 1
        test_data = Dataset(args.test_file)
        eval(test_data, model, meta_optimizer)
        exit()

    t = 0
    best_val_nll = 1e5
    best_epoch = 0
    val_stats = []
    epoch = 0
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        logger.info('Starting epoch %d' % epoch)
        # Per-epoch accumulators for each objective's components.
        train_nll_vae = 0.
        train_nll_autoreg = 0.
        train_kl_vae = 0.
        train_nll_svi = 0.
        train_kl_svi = 0.
        train_kl_init_final = 0.
        num_sents = 0
        num_words = 0
        b = 0

        for i in np.random.permutation(len(train_data)):
            # Linear beta warmup toward 1 over `warmup` epochs.
            if args.warmup > 0:
                args.beta = min(
                    1, args.beta + 1. / (args.warmup * len(train_data)))

            sents, length, batch_size = train_data[i]
            if args.gpu >= 0:
                sents = sents.cuda()
            b += 1

            optimizer.zero_grad()
            if args.model == 'autoreg':
                # Plain autoregressive LM: no latent variable.
                preds = model._dec_forward(sents, None, True)
                nll_autoreg = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_autoreg += nll_autoreg.data[0] * batch_size
                nll_autoreg.backward()
            elif args.model == 'svi':
                # SVI from scratch: variational params start near zero and
                # are refined by the meta optimizer for this batch.
                mean_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                logvar_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).cuda(),
                    requires_grad=True)
                var_params_svi = meta_optimizer.forward(
                    [mean_svi, logvar_svi], sents, b % args.print_every == 0)
                mean_svi_final, logvar_svi_final = var_params_svi
                z_samples = model._reparameterize(mean_svi_final.detach(),
                                                  logvar_svi_final.detach())
                preds = model._dec_forward(sents, z_samples)
                nll_svi = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                train_kl_svi += kl_svi.data[0] * batch_size
                var_loss = nll_svi + args.beta * kl_svi
                var_loss.backward(retain_graph=True)
            else:
                # 'vae' and 'savae': amortized inference via the encoder.
                mean, logvar = model._enc_forward(sents)
                z_samples = model._reparameterize(mean, logvar)
                preds = model._dec_forward(sents, z_samples)
                nll_vae = sum([
                    criterion(preds[:, l], sents[:, l + 1])
                    for l in range(length)
                ])
                train_nll_vae += nll_vae.data[0] * batch_size
                kl_vae = utils.kl_loss_diag(mean, logvar)
                train_kl_vae += kl_vae.data[0] * batch_size
                if args.model == 'vae':
                    vae_loss = nll_vae + args.beta * kl_vae
                    vae_loss.backward(retain_graph=True)
                if args.model == 'savae':
                    # SVI refinement starting from the amortized posterior.
                    var_params = torch.cat([mean, logvar], 1)
                    mean_svi = Variable(mean.data, requires_grad=True)
                    logvar_svi = Variable(logvar.data, requires_grad=True)
                    var_params_svi = meta_optimizer.forward(
                        [mean_svi, logvar_svi], sents,
                        b % args.print_every == 0)
                    mean_svi_final, logvar_svi_final = var_params_svi
                    z_samples = model._reparameterize(mean_svi_final,
                                                      logvar_svi_final)
                    preds = model._dec_forward(sents, z_samples)
                    nll_svi = sum([
                        criterion(preds[:, l], sents[:, l + 1])
                        for l in range(length)
                    ])
                    train_nll_svi += nll_svi.data[0] * batch_size
                    kl_svi = utils.kl_loss_diag(mean_svi_final,
                                                logvar_svi_final)
                    train_kl_svi += kl_svi.data[0] * batch_size
                    var_loss = nll_svi + args.beta * kl_svi
                    var_loss.backward(retain_graph=True)
                    if args.train_n2n == 0:
                        if args.train_kl == 1:
                            # Train encoder toward the refined posterior
                            # via KL(init || final).
                            mean_final = mean_svi_final.detach()
                            logvar_final = logvar_svi_final.detach()
                            kl_init_final = utils.kl_loss(
                                mean, logvar, mean_final, logvar_final)
                            train_kl_init_final += kl_init_final.data[
                                0] * batch_size
                            kl_init_final.backward(retain_graph=True)
                        else:
                            vae_loss = nll_vae + args.beta * kl_vae
                            var_param_grads = torch.autograd.grad(
                                vae_loss, [mean, logvar], retain_graph=True)
                            var_param_grads = torch.cat(var_param_grads, 1)
                            var_params.backward(var_param_grads,
                                                retain_graph=True)
                    else:
                        # Backprop through the SVI steps themselves (N2N).
                        var_param_grads = meta_optimizer.backward(
                            [mean_svi_final.grad, logvar_svi_final.grad],
                            b % args.print_every == 0)
                        var_param_grads = torch.cat(var_param_grads, 1)
                        var_params.backward(var_param_grads)
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm(model.parameters(),
                                              args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * length

            if b % args.print_every == 0:
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).data[0]**0.5
                logger.info(
                    'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, TrainARNLL: %.4f, TrainARPPL: %.2f, TrainVAE_NLL: %.4f, TrainVAE_REC: %.4f, TrainVAE_KL: %.4f, TrainVAE_PPL: %.2f, TrainSVI_NLL: %.2f, TrainSVI_REC: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPL: %.2f, KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, Beta: %.4f, Throughput: %.2f examples/sec'
                    % (t, epoch, b + 1, len(train_data), args.lr,
                       train_nll_autoreg / num_sents,
                       np.exp(train_nll_autoreg / num_words),
                       (train_nll_vae + train_kl_vae) / num_sents,
                       train_nll_vae / num_sents, train_kl_vae / num_sents,
                       np.exp((train_nll_vae + train_kl_vae) / num_words),
                       (train_nll_svi + train_kl_svi) / num_sents,
                       train_nll_svi / num_sents, train_kl_svi / num_sents,
                       np.exp((train_nll_svi + train_kl_svi) / num_words),
                       train_kl_init_final / num_sents, param_norm,
                       best_val_nll, best_epoch, args.beta, num_sents /
                       (time.time() - start_time)))

        epoch_train_time = time.time() - start_time
        logger.info('Time Elapsed: %.1fs' % epoch_train_time)

        logger.info('--------------------------------')
        logger.info('Checking validation perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Val')
        logger.record_tabular('LR', args.lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        val_nll = eval(val_data, model, meta_optimizer)
        val_stats.append(val_nll)

        logger.info('--------------------------------')
        logger.info('Checking test perf...')
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Mode', 'Test')
        logger.record_tabular('LR', args.lr)
        logger.record_tabular('Epoch Train Time', epoch_train_time)
        test_nll = eval(test_data, model, meta_optimizer)

        if val_nll < best_val_nll:
            best_val_nll = val_nll
            best_epoch = epoch
            # Move to CPU before saving so the checkpoint is device-agnostic.
            model.cpu()
            checkpoint = {
                'args': args.__dict__,
                'model': model,
                'val_stats': val_stats
            }
            logger.info('Save checkpoint to %s' % checkpoint_path)
            torch.save(checkpoint, checkpoint_path)
            model.cuda()
        else:
            if epoch >= args.min_epochs:
                # No improvement past the minimum epoch count; `decay`
                # presumably triggers LR decay elsewhere -- TODO confirm.
                args.decay = 1
Exemplo n.º 13
0
def main():
    """Train the roof classifier and checkpoint the best model.

    Parses CLI flags, builds a shuffled train/validation split, trains
    ``RoofEnsemble`` for ``--epochs`` epochs with class-weighted loss,
    logs losses/accuracies to TensorBoard, saves the best (lowest
    validation loss) weights to ``roof_cnn_best.pt`` and the final
    weights to ``roof_cnn.pt``.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch RoofNet test')
    parser.add_argument(
        '--batch-size', type=int, default=64, metavar='N',
        help='input batch size for training (default: 64)')
    parser.add_argument(
        '--test-batch-size', type=int, default=1000, metavar='N',
        help='input batch size for testing (default: 1000)')
    parser.add_argument(
        '--epochs', type=int, default=10, metavar='N',
        help='number of epochs to train (default: 10)')
    parser.add_argument(
        '--lr', type=float, default=0.01, metavar='LR',
        help='learning rate (default: 0.01)')
    parser.add_argument(
        '--no-cuda', action='store_true', default=False,
        help='disables CUDA training')
    parser.add_argument(
        '--seed', type=int, default=1, metavar='S',
        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument(
        '--save-model', action='store_true', default=True,
        help='For Saving the current Model')
    args = parser.parse_args()

    # NOTE(review): --seed, --test-batch-size and --lr are parsed but not
    # used below (torch seeding was commented out, the validation loader is
    # fixed at batch_size=1, and Adam is built with lr=0.001). Left as-is
    # since changing them would alter trained behaviour.
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    device = torch.device("cuda" if use_cuda else "cpu")

    # pin_memory speeds host->GPU copies; workers kept at 0.
    kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}

    # Data preprocessing: ImageNet-style resize + normalization.
    preprocessing = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                ])

    # Dataset loading and splitting
    data, weights = load_dataset(data_type='train', verified=True)
    validation_split = .05
    random_seed = 42
    shuffle_dataset = True

    # Creating data indices for training and validation splits:
    dataset_size = len(data)
    split = int(np.floor(validation_split * dataset_size))
    # Shuffle before splitting so the val set is a random sample.
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(data)
    train_data, val_data = data[split:], data[:split]
    train_dataset = Dataset(train_data, preprocessing)
    val_dataset = Dataset(val_data, preprocessing, train=False)

    # Setting batch data loaders.
    # BUGFIX: honour the --batch-size flag (a literal 64 was used before;
    # the default is still 64, so default behaviour is unchanged).
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=1, shuffle=True, **kwargs)

    # Creating model and loading on gpu
    model = RoofEnsemble().to(device)
    print(model)
    model = nn.DataParallel(model)  # As multi-gpu in Keras

    # Optimizer (explicit Adam defaults spelled out).
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999),
                           eps=1e-08, weight_decay=0, amsgrad=False)

    # Class weights for the loss, moved to the training device.
    weights = torch.from_numpy(weights).to(device)

    best = 100  # best (lowest) validation loss seen so far
    writer = SummaryWriter(log_dir='./logs/5', max_queue=1)  # max_queue=1 to flush data at every add

    # Training and testing phase
    for epoch in range(1, args.epochs + 1):
        train_loss, train_acc = train(args, model, device, train_loader,
                                      optimizer, epoch, weights)
        val_loss, test_acc = test(args, model, device, val_loader, weights)
        if val_loss < best:
            torch.save(model.module.state_dict(), 'roof_cnn_best.pt')
            best = val_loss
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/test', val_loss, epoch)
        writer.add_scalar('Acc/train', train_acc, epoch)
        writer.add_scalar('Acc/test', test_acc, epoch)

    # Saving model (state_dict of the wrapped module, not the DataParallel).
    if args.save_model:
        torch.save(model.module.state_dict(), 'roof_cnn.pt')
Exemplo n.º 14
0
from inception_v3 import inception_v3
from data import Dataset
import tensorflow as tf
import cv2
import numpy as np

# Constants -- Inception-v3's canonical input resolution is 299x299 RGB.
IMAGE_WIDTH = 299
IMAGE_HEIGHT = 299
EPOCHS = 100
BATCH_SIZE = 50

# Load dataset (NDSB plankton data, presumably -- confirm in data.Dataset).
ndsb = Dataset('train', IMAGE_HEIGHT, IMAGE_WIDTH)
ndsb.read_data()
num_classes = ndsb.num_classes

# Placeholder inputs and output
# (TF1 graph-mode placeholders; despite its name, `predict` holds the
# one-hot ground-truth labels fed at train time -- see cross_entropy below.)
inputs = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT, IMAGE_WIDTH, 3])
predict = tf.placeholder(tf.float32, [None, num_classes])

# Get model -- inception_v3 returns a pair; the second value (`some`,
# presumably auxiliary endpoints) is unused here. TODO confirm.
y_conv, some = inception_v3(inputs, num_classes=num_classes)

print(y_conv.shape)
print(predict.shape)

# Cross entropy graph: mean softmax cross-entropy over the batch.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=predict, logits=y_conv))
Exemplo n.º 15
0
def main():
    """Run top-5 inference over a dataset and write predictions to CSV.

    Uses the module-level ``parser`` for CLI args, builds a timm model
    (optionally with test-time pooling and multi-GPU DataParallel), runs
    the whole loader under ``no_grad``, and writes one line per image --
    ``filename,top1,...,top5`` -- to ``top5_ids.csv`` in ``args.output_dir``.
    """
    args = parser.parse_args()

    # create model
    model = create_model(args.model,
                         num_classes=args.num_classes,
                         in_chans=3,
                         pretrained=args.pretrained,
                         checkpoint_path=args.checkpoint)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    config = resolve_data_config(model, args)
    model, test_time_pool = apply_test_time_pool(model, config, args)

    if args.num_gpu > 1:
        model = torch.nn.DataParallel(model,
                                      device_ids=list(range(
                                          args.num_gpu))).cuda()
    else:
        model = model.cuda()

    loader = create_loader(
        Dataset(args.data),
        input_size=config['input_size'],
        batch_size=args.batch_size,
        use_prefetcher=True,
        interpolation=config['interpolation'],
        mean=config['mean'],
        std=config['std'],
        num_workers=args.workers,
        # With test-time pooling the model handles larger inputs itself.
        crop_pct=1.0 if test_time_pool else config['crop_pct'])

    model.eval()

    batch_time = AverageMeter()
    end = time.time()
    top5_ids = []
    with torch.no_grad():
        # FIX: renamed loop variable `input` -> `batch_input` so the
        # builtin `input` is no longer shadowed.
        for batch_idx, (batch_input, _) in enumerate(loader):
            batch_input = batch_input.cuda()
            labels = model(batch_input)
            # Indices (not values) of the five highest-scoring classes.
            top5 = labels.topk(5)[1]
            top5_ids.append(top5.cpu().numpy())

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.print_freq == 0:
                print(
                    'Predict: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(
                        batch_idx, len(loader), batch_time=batch_time))

    top5_ids = np.concatenate(top5_ids, axis=0).squeeze()

    with open(os.path.join(args.output_dir, './top5_ids.csv'),
              'w') as out_file:
        filenames = loader.dataset.filenames()
        for filename, label in zip(filenames, top5_ids):
            filename = os.path.basename(filename)
            out_file.write('{0},{1},{2},{3},{4},{5}\n'.format(
                filename, label[0], label[1], label[2], label[3], label[4]))
Exemplo n.º 16
0
def train(params):
    """Full training driver for the NMT model (TF1 graph mode).

    Builds train/dev datasets and the multi-GPU training + inference
    graphs, then loops over epochs feeding `update_cycle`-accumulated
    batches. Periodically logs, saves checkpoints, evaluates BLEU on dev,
    samples translations, and handles early stopping via the recorder.
    Returns 0. immediately if a recorded stop condition is already met,
    otherwise the saver's best score.
    """
    # status measure
    if params.recorder.estop or \
            params.recorder.epoch > params.epoches or \
            params.recorder.step > params.max_training_steps:
        tf.logging.info("Stop condition reached, you have finished training your model.")
        return 0.

    # loading dataset
    tf.logging.info("Begin Loading Training and Dev Dataset")
    start_time = time.time()
    train_dataset = Dataset(params.src_train_file, params.tgt_train_file,
                            params.src_vocab, params.tgt_vocab, params.max_len,
                            batch_or_token=params.batch_or_token,
                            data_leak_ratio=params.data_leak_ratio)
    # NOTE(review): the dev Dataset is built with the source file/vocab on
    # BOTH sides -- apparently only the source side is consumed at decoding
    # time (BLEU references come from params.tgt_dev_file below); confirm.
    dev_dataset = Dataset(params.src_dev_file, params.src_dev_file,
                          params.src_vocab, params.src_vocab, params.eval_max_len,
                          batch_or_token='batch',
                          data_leak_ratio=params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        lr = tf.placeholder(tf.as_dtype(dtype.floatx()), [], "learn_rate")

        # shift automatically sliced multi-gpu process into `zero` manner :)
        # One (source, target) placeholder pair per GPU (at least one).
        features = []
        for fidx in range(max(len(params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
                "target": tf.placeholder(tf.int32, [None, None], "target"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(params.gpus)

        tf.logging.info("Begining Building Training Graph")
        start_time = time.time()

        # create global step
        global_step = tf.train.get_or_create_global_step()

        # set up optimizer
        optimizer = tf.train.AdamOptimizer(lr,
                                           beta1=params.beta1,
                                           beta2=params.beta2,
                                           epsilon=params.epsilon)

        # get graph
        graph = model.get_model(params.model_name)

        # set up training graph
        loss, gradients = tower_train_graph(features, optimizer, graph, params)

        # apply pseudo cyclic parallel operation
        # (zero/collect/train ops implement gradient accumulation across
        #  update_cycle sub-steps.)
        vle, ops = cycle.create_train_op({"loss": loss}, gradients,
                                         optimizer, global_step, params)

        tf.logging.info("End Building Training Graph, within {} seconds".format(time.time() - start_time))

        tf.logging.info("Begin Building Inferring Graph")
        start_time = time.time()

        # set up infer graph
        eval_seqs, eval_scores = tower_infer_graph(features, graph, params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # create saver
        train_saver = saver.Saver(
            checkpoints=params.checkpoints,
            output_dir=params.output_dir,
            best_checkpoints=params.best_checkpoints,
        )

        tf.logging.info("Training")
        cycle_counter = 0
        data_on_gpu = []
        cum_tokens = []

        # restore parameters
        tf.logging.info("Trying restore pretrained parameters")
        train_saver.restore(sess, path=params.pretrained_model)

        tf.logging.info("Trying restore existing parameters")
        train_saver.restore(sess)

        # setup learning rate
        params.lrate = params.recorder.lrate
        adapt_lr = lrs.get_lr(params)

        start_time = time.time()
        start_epoch = params.recorder.epoch
        for epoch in range(start_epoch, params.epoches + 1):

            params.recorder.epoch = epoch

            tf.logging.info("Training the model for epoch {}".format(epoch))
            size = params.batch_size if params.batch_or_token == 'batch' \
                else params.token_size

            # Background-process queue feeding shuffled training batches.
            train_queue = queuer.EnQueuer(
                train_dataset.batcher(size,
                                      buffer_size=params.buffer_size,
                                      shuffle=params.shuffle_batch,
                                      train=True),
                lambda x: x,
                worker_processes_num=params.process_num,
                input_queue_size=params.input_queue_size,
                output_queue_size=params.output_queue_size,
            )

            adapt_lr.before_epoch(eidx=epoch)

            for lidx, data in enumerate(train_queue):

                # When resuming mid-epoch, skip batches already consumed
                # (logging only occasionally to avoid spam).
                if params.train_continue:
                    if lidx <= params.recorder.lidx:
                        segments = params.recorder.lidx // 5
                        if params.recorder.lidx < 5 or lidx % segments == 0:
                            tf.logging.info(
                                "{} Passing {}-th index according to record".format(util.time_str(time.time()), lidx))

                        continue

                params.recorder.lidx = lidx

                data_on_gpu.append(data)
                # use multiple gpus, and data samples is not enough
                # make sure the data is fully added
                # The actual batch size: batch_size * num_gpus * update_cycle
                if len(params.gpus) > 0 and len(data_on_gpu) < len(params.gpus):
                    continue

                # increase the counter by 1
                cycle_counter += 1

                if cycle_counter == 1:
                    # calculate adaptive learning rate
                    adapt_lr.step(params.recorder.step)

                    # clear internal states
                    sess.run(ops["zero_op"])

                # data feeding to gpu placeholders
                feed_dicts = {}
                for fidx, shard_data in enumerate(data_on_gpu):
                    # define feed_dict
                    feed_dict = {
                        features[fidx]["source"]: shard_data["src"],
                        features[fidx]["target"]: shard_data["tgt"],
                        lr: adapt_lr.get_lr(),
                    }
                    feed_dicts.update(feed_dict)

                    # collect target tokens
                    cum_tokens.append(np.sum(shard_data['tgt'] > 0))

                # reset data points on gpus
                data_on_gpu = []

                # internal accumulative gradient collection
                if cycle_counter < params.update_cycle:
                    sess.run(ops["collect_op"], feed_dict=feed_dicts)

                # at the final step, update model parameters
                if cycle_counter == params.update_cycle:
                    cycle_counter = 0

                    # directly update parameters, usually this works well
                    if not params.safe_nan:
                        _, loss, gnorm, pnorm, gstep = sess.run(
                            [ops["train_op"], vle["loss"], vle["gradient_norm"], vle["parameter_norm"],
                             global_step], feed_dict=feed_dicts)

                        if np.isnan(loss) or np.isinf(loss) or np.isnan(gnorm) or np.isinf(gnorm):
                            tf.logging.error("Nan or Inf raised! Loss {} GNorm {}.".format(loss, gnorm))
                            params.recorder.estop = True
                            break
                    else:
                        # Notice, applying safe nan can help train the big model, but sacrifice speed
                        loss, gnorm, pnorm, gstep = sess.run(
                            [vle["loss"], vle["gradient_norm"], vle["parameter_norm"], global_step],
                            feed_dict=feed_dicts)

                        if np.isnan(loss) or np.isinf(loss) or np.isnan(gnorm) or np.isinf(gnorm) \
                                or gnorm > params.gnorm_upper_bound:
                            tf.logging.error(
                                "Nan or Inf raised, GStep {} is passed! Loss {} GNorm {}.".format(gstep, loss, gnorm))
                            continue

                        sess.run(ops["train_op"], feed_dict=feed_dicts)

                    if gstep % params.disp_freq == 0:
                        end_time = time.time()
                        tf.logging.info(
                            "{} Epoch {}, GStep {}~{}, LStep {}~{}, "
                            "Loss {:.3f}, GNorm {:.3f}, PNorm {:.3f}, Lr {:.5f}, "
                            "Src {}, Tgt {}, Tokens {}, UD {:.3f} s".format(
                                util.time_str(end_time), epoch,
                                gstep - params.disp_freq + 1, gstep,
                                lidx - params.disp_freq + 1, lidx,
                                loss, gnorm, pnorm,
                                adapt_lr.get_lr(), data['src'].shape, data['tgt'].shape,
                                np.sum(cum_tokens), end_time - start_time)
                        )
                        start_time = time.time()
                        cum_tokens = []

                    # trigger model saver
                    if gstep > 0 and gstep % params.save_freq == 0:
                        train_saver.save(sess, gstep)
                        params.recorder.save_to_json(os.path.join(params.output_dir, "record.json"))

                    # trigger model evaluation
                    if gstep > 0 and gstep % params.eval_freq == 0:
                        # Evaluate with EMA weights if enabled, then restore.
                        if params.ema_decay > 0.:
                            sess.run(ops['ema_backup_op'])
                            sess.run(ops['ema_assign_op'])

                        tf.logging.info("Start Evaluating")
                        eval_start_time = time.time()
                        tranes, scores, indices = evalu.decoding(
                            sess, features, eval_seqs,
                            eval_scores, dev_dataset, params)
                        bleu = evalu.eval_metric(tranes, params.tgt_dev_file, indices=indices)
                        eval_end_time = time.time()
                        tf.logging.info("End Evaluating")

                        if params.ema_decay > 0.:
                            sess.run(ops['ema_restore_op'])

                        tf.logging.info(
                            "{} GStep {}, Scores {}, BLEU {}, Duration {:.3f} s".format(
                                util.time_str(eval_end_time), gstep, np.mean(scores),
                                bleu, eval_end_time - eval_start_time)
                        )

                        # save eval translation
                        evalu.dump_tanslation(
                            tranes,
                            os.path.join(params.output_dir, "eval-{}.trans.txt".format(gstep)),
                            indices=indices)

                        # save parameters
                        train_saver.save(sess, gstep, bleu)

                        # check for early stopping
                        valid_scores = [v[1] for v in params.recorder.valid_script_scores]
                        if len(valid_scores) == 0 or bleu > np.max(valid_scores):
                            params.recorder.bad_counter = 0
                        else:
                            params.recorder.bad_counter += 1

                            if params.recorder.bad_counter > params.estop_patience:
                                params.recorder.estop = True
                                break

                        params.recorder.history_scores.append((gstep, float(np.mean(scores))))
                        params.recorder.valid_script_scores.append((gstep, float(bleu)))
                        params.recorder.save_to_json(os.path.join(params.output_dir, "record.json"))

                        # handle the learning rate decay in a typical manner
                        adapt_lr.after_eval(float(bleu))

                    # trigger temporary sampling
                    if gstep > 0 and gstep % params.sample_freq == 0:
                        tf.logging.info("Start Sampling")
                        decode_seqs, decode_scores = sess.run(
                            [eval_seqs[:1], eval_scores[:1]], feed_dict={features[0]["source"]: data["src"][:5]})
                        tranes, scores = evalu.decode_hypothesis(decode_seqs, decode_scores, params)

                        for sidx in range(min(5, len(scores))):
                            sample_source = evalu.decode_target_token(data['src'][sidx], params.src_vocab)
                            tf.logging.info("{}-th Source: {}".format(sidx, ' '.join(sample_source)))
                            sample_target = evalu.decode_target_token(data['tgt'][sidx], params.tgt_vocab)
                            tf.logging.info("{}-th Target: {}".format(sidx, ' '.join(sample_target)))
                            sample_trans = tranes[sidx]
                            tf.logging.info("{}-th Translation: {}".format(sidx, ' '.join(sample_trans)))

                        tf.logging.info("End Sampling")

                    # trigger stopping
                    if gstep >= params.max_training_steps:
                        # stop running by setting EStop signal
                        params.recorder.estop = True
                        break

                    # should be equal to global_step
                    params.recorder.step = gstep

            if params.recorder.estop:
                tf.logging.info("Early Stopped!")
                break

            # reset to 0
            params.recorder.lidx = -1

            adapt_lr.after_epoch(eidx=epoch)

    # Final Evaluation
    # NOTE(review): this section sits outside the `with tf.Graph()` block
    # but still uses `sess`/`ops`/`features` -- those names leak out of the
    # with-statement in Python, so this works as long as train completes.
    tf.logging.info("Start Final Evaluating")
    if params.ema_decay > 0.:
        sess.run(ops['ema_backup_op'])
        sess.run(ops['ema_assign_op'])

    gstep = int(params.recorder.step + 1)
    eval_start_time = time.time()
    tranes, scores, indices = evalu.decoding(sess, features, eval_seqs, eval_scores, dev_dataset, params)
    bleu = evalu.eval_metric(tranes, params.tgt_dev_file, indices=indices)
    eval_end_time = time.time()
    tf.logging.info("End Evaluating")

    if params.ema_decay > 0.:
        sess.run(ops['ema_restore_op'])

    tf.logging.info(
        "{} GStep {}, Scores {}, BLEU {}, Duration {:.3f} s".format(
            util.time_str(eval_end_time), gstep, np.mean(scores), bleu, eval_end_time - eval_start_time)
    )

    # save eval translation
    evalu.dump_tanslation(
        tranes,
        os.path.join(params.output_dir, "eval-{}.trans.txt".format(gstep)),
        indices=indices)

    tf.logging.info("Your training is finished :)")

    return train_saver.best_score
Exemplo n.º 17
0
 def __init__(self, config=defaults, name=None):
     """Initialise with a configuration and empty corpus containers."""
     # Keep the supplied (or default) configuration for later lookups.
     self.config = config
     # Fresh dataset and document plus parallel text/tag buffers.
     self.dataset = Dataset(name=name)
     self.document = Document()
     self.texts, self.tags = [], []
Exemplo n.º 18
0
def scorer(params):
    """Force-decode the test set and report per-sentence scores and PPL.

    Builds the scoring graph, restores the best checkpoints (optionally
    swapping in EMA-smoothed weights), runs evalu.scoring over the test
    dataset and writes the scores to ``params.test_output``.

    Returns:
        The mean per-sentence score (numpy float).
    """
    # loading dataset
    tf.logging.info("Begin Loading Test Dataset")
    start_time = time.time()
    test_dataset = Dataset(params.src_test_file, params.tgt_test_file,
                           params.src_vocab, params.tgt_vocab, params.eval_max_len,
                           batch_or_token='batch',
                           data_leak_ratio=params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        # one source/target placeholder pair per GPU (at least one for CPU-only runs)
        features = []
        for fidx in range(max(len(params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
                "target": tf.placeholder(tf.int32, [None, None], "target"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(params.gpus)

        tf.logging.info("Begining Building Evaluation Graph")
        start_time = time.time()

        # get graph
        graph = model.get_model(params.model_name)

        # set up infer graph
        eval_scores = tower_score_graph(features, graph, params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # set up ema
        if params.ema_decay > 0.:
            # recover from EMA: overwrite every trainable variable with its
            # exponential moving average before scoring
            ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay)
            ema.apply(tf.trainable_variables())
            ema_assign_op = tf.group(*(tf.assign(var, ema.average(var).read_value())
                                       for var in tf.trainable_variables()))
        else:
            ema_assign_op = tf.no_op()

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # create saver
        eval_saver = saver.Saver(checkpoints=params.checkpoints, output_dir=params.output_dir)

        # restore parameters
        tf.logging.info("Trying restore existing parameters")
        eval_saver.restore(sess, params.output_dir)
        sess.run(ema_assign_op)

        tf.logging.info("Starting Evaluating")
        eval_start_time = time.time()
        scores, ppl = evalu.scoring(sess, features, eval_scores, test_dataset, params)
        eval_end_time = time.time()

        tf.logging.info(
            "{} Scores {}, PPL {}, Duration {}s".format(
                util.time_str(eval_end_time), np.mean(scores), ppl, eval_end_time - eval_start_time)
        )

        # save translation (here: one score per line; the helper's name typo
        # is part of the project's evalu API, so it is kept as-is)
        evalu.dump_tanslation(scores, params.test_output)

    return np.mean(scores)
Exemplo n.º 19
0
def main():
    """Evaluate (and optionally LPOT-tune/quantize) a geffnet model.

    Command-line driven: with ``--tune`` the fused model is quantized via
    LPOT, saved to ``--tuned-checkpoint`` and the process exits; with
    ``--int8`` a previously tuned checkpoint is loaded for evaluation.
    Otherwise the FP32 model is evaluated, reporting loss, Prec@1/5,
    latency and throughput on the validation split under ``args.data``.
    """
    args = parser.parse_args()
    print(args)

    # derive image size / crop percentage from the model when not given
    if args.img_size is None:
        args.img_size, args.crop_pct = get_image_size_crop_pct(args.model)

    # no explicit checkpoint: fall back to pretrained weights
    if not args.checkpoint and not args.pretrained:
        args.pretrained = True

    if args.torchscript:
        geffnet.config.set_scriptable(True)

    # create model
    model = geffnet.create_model(args.model,
                                 num_classes=args.num_classes,
                                 in_chans=3,
                                 pretrained=args.pretrained,
                                 checkpoint_path=args.checkpoint)

    if args.torchscript:
        torch.jit.optimized_execution(True)
        model = torch.jit.script(model)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args)

    criterion = nn.CrossEntropyLoss()

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              args.num_gpu))).cuda()
        else:
            model = model.cuda()
        criterion = criterion.cuda()

    # LPOT tuning path: quantize the fused model, save it, and stop here
    if args.tune:
        model.eval()
        model.fuse_model()
        conf_yaml = "conf_" + args.model + ".yaml"
        from lpot.experimental import Quantization, common
        quantizer = Quantization(conf_yaml)
        quantizer.model = common.Model(model)
        q_model = quantizer()
        q_model.save(args.tuned_checkpoint)
        exit(0)

    valdir = os.path.join(args.data, 'val')
    loader = create_loader(Dataset(valdir, load_bytes=args.tf_preprocessing),
                           input_size=data_config['input_size'],
                           batch_size=args.batch_size,
                           use_prefetcher=not args.no_cuda,
                           interpolation=data_config['interpolation'],
                           mean=data_config['mean'],
                           std=data_config['std'],
                           num_workers=args.workers,
                           crop_pct=data_config['crop_pct'],
                           tensorflow_preprocessing=args.tf_preprocessing)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    model.fuse_model()
    # --int8: evaluate the previously LPOT-tuned model instead of FP32
    if args.int8:
        from lpot.utils.pytorch import load
        new_model = load(
            os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
    else:
        new_model = model

    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            # timing starts only after the warmup iterations
            if i >= args.warmup_iterations:
                start = time.time()
            if not args.no_cuda:
                target = target.cuda()
                input = input.cuda()

            # compute output
            output = new_model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            if i >= args.warmup_iterations:
                # measure elapsed time
                batch_time.update(time.time() - start)

            # NOTE(review): rate_avg divides by batch_time.avg, which has had
            # no updates before the first post-warmup batch — verify
            # AverageMeter guards against a zero average when warmup > 0.
            if i % args.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses,
                        top1=top1,
                        top5=top5))
            # optional cap on the number of measured iterations
            if args.iterations > 0 and i >= args.iterations + args.warmup_iterations - 1:
                break

        print('Batch size = %d' % args.batch_size)
        if args.batch_size == 1:
            print('Latency: %.3f ms' % (batch_time.avg * 1000))
        print('Throughput: %.3f images/sec' %
              (args.batch_size / batch_time.avg))
        print('Accuracy: {top1:.5f} Accuracy@5 {top5:.5f}'.format(
            top1=(top1.avg / 100), top5=(top5.avg / 100)))
Exemplo n.º 20
0
def ensemble(total_params):
    """Decode the test set with an ensemble of independently trained models.

    Each entry in ``total_params`` describes one model; their checkpoints are
    loaded under renamed scopes (``<scope>_ensembler_<i>``) into a single
    graph, optionally EMA-smoothed, then jointly decoded and scored with BLEU.

    Returns:
        The BLEU score of the ensemble translation.
    """
    # loading dataset
    tf.logging.info("Begin Loading Test Dataset")
    start_time = time.time()
    # assume that different configurations use the same test file
    default_params = total_params[0]
    # assume that different models share the same source and target vocabulary, usually it's the case
    # NOTE(review): the target side of the Dataset is fed the *source* file —
    # presumably a placeholder since decoding needs no references; confirm.
    test_dataset = Dataset(default_params.src_test_file, default_params.src_test_file,
                           default_params.src_vocab, default_params.src_vocab, default_params.eval_max_len,
                           batch_or_token='batch',
                           data_leak_ratio=default_params.data_leak_ratio)
    tf.logging.info(
        "End Loading dataset, within {} seconds".format(time.time() - start_time))

    # Build Graph
    with tf.Graph().as_default():
        # one source placeholder per GPU (at least one for CPU-only runs)
        features = []
        for fidx in range(max(len(default_params.gpus), 1)):
            feature = {
                "source": tf.placeholder(tf.int32, [None, None], "source"),
            }
            features.append(feature)

        # session info
        sess = util.get_session(default_params.gpus)

        tf.logging.info("Begining Building Evaluation Graph")
        start_time = time.time()

        # get graph
        total_graphs = [model.get_model(params.model_name) for params in total_params]

        # set up infer graph
        eval_seqs, eval_scores = tower_ensemble_graph(features, total_graphs, total_params)

        tf.logging.info("End Building Inferring Graph, within {} seconds".format(time.time() - start_time))

        # set up ema
        # collect ema variables: one bucket per ensemble member that trained with EMA
        ema_used_models = {}
        for midx, params in enumerate(total_params):
            if params.ema_decay > 0.:
                ema_used_models[params.scope_name + "_ensembler_%d" % midx] = []

        # bucket each trainable variable by its top-level (ensembler) scope
        for var in tf.trainable_variables():
            name = var.op.name

            key = name[:name.find('/')]

            if key in ema_used_models:
                ema_used_models[key].append(var)

        # build per-member EMA swap-in ops (no-op when a member has no EMA)
        ema_assign_list = [tf.no_op()]
        for midx, params in enumerate(total_params):
            if params.ema_decay > 0.:
                key = params.scope_name + "_ensembler_%d" % midx

                ema = tf.train.ExponentialMovingAverage(decay=params.ema_decay)
                ema.apply(ema_used_models[key])
                ema_assign_list += [tf.assign(var, ema.average(var).read_value()) for var in ema_used_models[key]]
        ema_assign_op = tf.group(*ema_assign_list)

        # initialize the model
        sess.run(tf.global_variables_initializer())

        # log parameters
        util.variable_printer()

        # restore parameters
        tf.logging.info("Trying restore existing parameters")
        all_var_list = {}
        for midx, params in enumerate(total_params):
            # locate each member's latest checkpoint via its checkpoint file
            checkpoint = os.path.join(params.output_dir, "checkpoint")
            assert tf.gfile.Exists(checkpoint)

            latest_checkpoint = tf.gfile.Open(checkpoint).readline()
            model_name = latest_checkpoint.strip().split(":")[1].strip()
            model_name = model_name[1:-1]  # remove ""
            model_path = os.path.join(params.output_dir, model_name)
            model_path = os.path.abspath(model_path)

            assert tf.gfile.Exists(model_path + ".meta")

            tf.logging.warn("Starting Backup Restore {}-th Model".format(midx))

            reader = tf.train.load_checkpoint(model_path)

            # adapt the model names: rewrite each checkpoint variable into
            # its renamed ensemble scope so members do not collide
            for name, shape in tf.train.list_variables(model_path):
                model_name = name.split('/')[0]
                ensemble_name = "{}_ensembler_{}/{}".format(model_name, midx, name[name.find('/') + 1:])
                all_var_list[ensemble_name] = reader.get_tensor(name)

        # assign loaded tensors onto matching graph variables; log mismatches
        ops = []
        for var in tf.global_variables():
            name = var.op.name

            if name in all_var_list:
                tf.logging.info('{} **Good**'.format(name))
                ops.append(
                    tf.assign(var, all_var_list[name])
                )
            else:
                tf.logging.warn("{} --Bad--".format(name))
        restore_op = tf.group(*ops, name="restore_global_vars")

        sess.run(restore_op)
        sess.run(ema_assign_op)

        tf.logging.info("Starting Evaluating")
        eval_start_time = time.time()
        tranes, scores, indices = evalu.decoding(sess, features, eval_seqs, eval_scores, test_dataset, default_params)
        bleu = evalu.eval_metric(tranes, default_params.tgt_test_file, indices=indices)
        eval_end_time = time.time()

        tf.logging.info(
            "{} Scores {}, BLEU {}, Duration {}s".format(
                util.time_str(eval_end_time),
                        np.mean(scores), bleu, eval_end_time - eval_start_time)
        )

        # save translation
        evalu.dump_tanslation(tranes, default_params.test_output, indices=indices)

    return bleu
Exemplo n.º 21
0
def main():
    """Evaluate a geffnet model on a labeled image folder.

    Command-line driven: builds the model (pretrained or from a checkpoint),
    optionally scripts it with TorchScript and/or runs under native AMP
    autocast, then reports loss, Prec@1 and Prec@5 over ``args.data``.
    """
    args = parser.parse_args()

    # no explicit checkpoint: fall back to pretrained weights
    if not args.checkpoint and not args.pretrained:
        args.pretrained = True

    amp_autocast = suppress  # do nothing
    if args.amp:
        if not has_native_amp:
            print(
                "Native Torch AMP is not available (requires torch >= 1.6), using FP32."
            )
        else:
            amp_autocast = torch.cuda.amp.autocast

    # create model
    model = geffnet.create_model(args.model,
                                 num_classes=args.num_classes,
                                 in_chans=3,
                                 pretrained=args.pretrained,
                                 checkpoint_path=args.checkpoint,
                                 scriptable=args.torchscript)

    if args.channels_last:
        model = model.to(memory_format=torch.channels_last)

    if args.torchscript:
        torch.jit.optimized_execution(True)
        model = torch.jit.script(model)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args)

    criterion = nn.CrossEntropyLoss()

    if not args.no_cuda:
        if args.num_gpu > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=list(range(
                                              args.num_gpu))).cuda()
        else:
            model = model.cuda()
        criterion = criterion.cuda()

    loader = create_loader(Dataset(args.data,
                                   load_bytes=args.tf_preprocessing),
                           input_size=data_config['input_size'],
                           batch_size=args.batch_size,
                           use_prefetcher=not args.no_cuda,
                           interpolation=data_config['interpolation'],
                           mean=data_config['mean'],
                           std=data_config['std'],
                           num_workers=args.workers,
                           crop_pct=data_config['crop_pct'],
                           tensorflow_preprocessing=args.tf_preprocessing)

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            if not args.no_cuda:
                target = target.cuda()
                input = input.cuda()
            if args.channels_last:
                input = input.contiguous(memory_format=torch.channels_last)

            # compute output (under AMP autocast when enabled)
            with amp_autocast():
                output = model(input)
                loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1.item(), input.size(0))
            top5.update(prec5.item(), input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print(
                    'Test: [{0}/{1}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f}, {rate_avg:.3f}/s) \t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i,
                        len(loader),
                        batch_time=batch_time,
                        rate_avg=input.size(0) / batch_time.avg,
                        loss=losses,
                        top1=top1,
                        top5=top5))

    # final summary: also print top-1/top-5 error (100 - accuracy)
    print(
        ' * Prec@1 {top1.avg:.3f} ({top1a:.3f}) Prec@5 {top5.avg:.3f} ({top5a:.3f})'
        .format(top1=top1,
                top1a=100 - top1.avg,
                top5=top5,
                top5a=100. - top5.avg))
Exemplo n.º 22
0
def load_dataset():
    """Load the Dataset from FLAGS.data_path, verifying it is non-empty."""
    ds = Dataset(FLAGS.data_path, verbose=True)
    assert len(ds.samples) > 0
    return ds
Exemplo n.º 23
0
from data import Dataset
from options import args

import os
from model import create_model
import utils

# for k, v in vars(args).items():
#     print("{}: {}".format(k, v))

# Resolve test input directory and per-run output directory.
src = args.test_src
dest = os.path.join(args.test_dest, args.name)
if not os.path.isdir(dest):
    os.mkdir(dest)

# Test-time loader: no shuffling so outputs stay aligned with input order.
dataset = Dataset(src)
loader = DataLoader(dataset,
                    batch_size=args.test_batch_size,
                    shuffle=False,
                    num_workers=args.num_workers,
                    pin_memory=True)

model = create_model(args)
model.eval()
# "..." is the sentinel value meaning "no pretrained weights configured".
if args.pre_train_model != "...":
    print("Loading pretrained model ... ")
    model.load_state_dict(torch.load(args.pre_train_model))

print("Testing...")
with torch.no_grad():
    for i, (name, lr) in enumerate(loader):
#test_no = df[df.iloc[:,0] == 'testing'].shape[0]
#def train(batch_size=4, nb_epoch=10):

# Keras callbacks: keep only the best checkpoint (by val_loss) on disk.
checkpointer = ModelCheckpoint(filepath=os.path.join('data', 'checkpoints', 'lstm' + '-' + 'features' + '.{epoch:03d}-{val_loss:.3f}.hdf5'),verbose=1,save_best_only=True)


tb = TensorBoard(log_dir=os.path.join('data', 'logs', 'lstm'))

# Stop training after 5 epochs without improvement.
early_stopper = EarlyStopping(patience=5)

timestamp = time.time()

csv_logger = CSVLogger(os.path.join('data', 'logs', 'lstm' + '-' + 'training-' + str(timestamp) + '.log'))

# NOTE(review): `seq` and `hyper` are not defined in this chunk — presumably
# module-level hyperparameters (sequence length / data limit); confirm.
data = Dataset(
        seq_length=seq,
        class_limit=2,
    )

steps_per_epoch = 4

# Load all train/test sequences into memory up front.
X, y = data.get_all_sequences_in_memory('training', hyper, seq)
X_test, y_test = data.get_all_sequences_in_memory('testing', hyper, seq)

#X_test, y_test = data.get_all_sequences_in_memory('testing', cnt, seq)

rm = ResearchModels(len(data.classes),'lstm',data.seq_length, None)
print("##################################################")
#X=X[2:]
#X_test=X_test[2:]
print(X.shape)
X=np.ravel(X)
Exemplo n.º 25
0
        # cylib.collect_confusion_matrix(y_pred_np.reshape(-1),
        #                                y_np.reshape(-1), conf_mat)
        # conf_mat_all += conf_mat_np.astype(np.uint64)
        if i % 10 == 0:
            string = 'batch %03d loss = %.2f  (%.1f images/sec)' % \
              (i, loss_np, x_np.shape[0] / duration)
            print(string)
    print(conf_mat)
    return utils.print_stats(conf_mat, 'Validation', Dataset.class_info)


# BEGINNING: top-level script setup — fixed seed, datasets, input placeholders.

tf.set_random_seed(31415)

train_data = Dataset('train', batch_size)
val_data = Dataset('val', batch_size, shuffle=False)

# Image geometry comes from the training split.
height = train_data.height
width = train_data.width
channels = train_data.channels

# x = tf.placeholder(tf.float32, shape=(batch_size, height, width, channels))
# y = tf.placeholder(tf.int32, shape=(batch_size, height, width))

# create placeholders for inputs; batch dimension left dynamic (None)
with tf.name_scope('data'):
    x = tf.placeholder(tf.float32,
                       shape=(None, height, width, channels),
                       name='rgb_images')
    y = tf.placeholder(tf.int32, shape=(None, height, width), name='labels')
Exemplo n.º 26
0
 def __init__(self, dataset=None):
     """Initialize the evaluator, defaulting to a freshly built Dataset."""
     if dataset is None:
         dataset = Dataset()
     super(Evaluator, self).__init__(dataset)
Exemplo n.º 27
0
def main():
    """Preprocess a summarization corpus into indexed, padded arrays.

    Reads the chosen dataset, builds (or, for duc2007, reuses) a vocabulary,
    replaces words with indices, pads sentences, averages glove neighbors for
    out-of-vocabulary words, and pickles data/lengths/embeddings/vocab to
    ``--save-path``.
    """
    # dataset name -> reader function
    datasets = {
        'cnn+dailymail': read_cnn_dailymail,
        'cnn': read_cnn_dailymail,
        'daily': read_cnn_dailymail,
        'duc2007': read_duc2007,
    }
    parser = argparse.ArgumentParser()
    parser.add_argument('--glove',
                        default='/data/sjx/glove.6B.100d.py36.pkl',
                        help='pickle file of glove')
    parser.add_argument('--data',
                        default='cnn+dailymail',
                        choices=datasets.keys())
    parser.add_argument(
        '--data-dir',
        default=
        '/data/share/cnn_stories/stories;/data/share/dailymail_stories/stories',
        help=
        'If data=cnn+dailimail, then data-dir must contain two paths for cnn and dailymail seperated by ;.'
    )
    parser.add_argument('--save-path', required=True)
    parser.add_argument('--max-word-num', type=int, default=50000)
    args = parser.parse_args()

    print('Loading glove......')
    glove = pickle.load(open(args.glove, 'rb'))
    word_dim = len(glove['the'])
    print('Word dim = %d' % word_dim)

    print('Reading data......')
    # data/length: [train, valid, test]; each sample is (content, summary)
    data, length = datasets[args.data](args.data, args.data_dir)
    print('train/valid/test: %d/%d/%d' % tuple([len(_) for _ in data]))

    print('Count word frequency only from train set......')
    wtof = {}
    # duc2007 reuses a prebuilt vocabulary (below), so no counting needed
    if (args.data == 'duc2007'):
        pass
    else:
        for j in range(len(data[0])):  # j-th sample of train set
            for k in range(2):  # 0: content, 1: summary
                for l in range(len(data[0][j][k])):  # l-th sentence
                    for word in data[0][j][k][l]:
                        wtof[word] = wtof.get(word, 0) + 1
        # keep only the most frequent max_word_num words
        wtof = Counter(wtof).most_common(args.max_word_num)
        needed_words = {w[0]: w[1] for w in wtof}
        # print('Preserve word num: %d. Examples: %s %s' % (len(needed_words), wtof[0][0], wtof[1][0]))

    # vocabulary bootstrapped with padding and unknown tokens
    itow = ['<pad>', '<unk>']
    wtoi = {'<pad>': 0, '<unk>': 1}
    count = 2
    glove['<pad>'] = np.zeros((word_dim, ))
    glove['<unk>'] = np.zeros((word_dim, ))
    # word -> list of glove vectors of its in-glove context neighbors
    missing_word_neighbors = {}

    print('Replace word string with word index......')
    if (args.data == 'duc2007'):
        # reuse the vocabulary/embeddings of a preprocessed CNN/DailyMail dump
        cnn_data = Dataset(path='/data/c-liang/data/cnndaily_5w_100d.pkl')
        needed_words = cnn_data.wtoi
        wtoi = cnn_data.wtoi
        itow = cnn_data.itow
        for i in range(len(data)):
            for j in range(len(data[i])):
                for k in range(2):  # 0: content, 1: summary
                    max_len = max([len(s) for s in data[i][j][k]
                                   ])  # max length of sentences for padding
                    for l in range(len(data[i][j][k])):  # l-th sentence
                        for m, word in enumerate(
                                data[i][j][k][l]):  # m-th word
                            if word not in wtoi:
                                word = '<unk>'
                            data[i][j][k][l][m] = wtoi[word]
                        data[i][j][k][l] += [0] * (max_len - len(
                            data[i][j][k][l]))  # padding l-th sentence
                    data[i][j][k] = np.asarray(data[i][j][k], dtype='int32')
                    length[i][j][k] = np.asarray(length[i][j][k],
                                                 dtype='int32')
                    # np.array for all documents/summaries
                    # shape of each document/summary: (# sentence, max length)
    else:
        for i in range(len(data)):
            for j in range(len(data[i])):
                for k in range(2):  # 0: content, 1: summary
                    max_len = max([len(s) for s in data[i][j][k]
                                   ])  # max length of sentences for padding
                    for l in range(len(data[i][j][k])):  # l-th sentence
                        for m, word in enumerate(
                                data[i][j][k][l]):  # m-th word
                            if word not in needed_words:
                                word = '<unk>'
                            elif word not in wtoi:
                                # first occurrence of a kept word: assign next index
                                itow.append(word)
                                wtoi[word] = count
                                count += 1
                            #print(word)
                            data[i][j][k][l][m] = wtoi[word]
                            # Find neighbor vectors for those words not in glove
                            if word not in glove:
                                if word not in missing_word_neighbors:
                                    missing_word_neighbors[word] = []
                                for neighbor in data[i][j][k][l][
                                        m - 5:m + 6]:  # window size: 10
                                    if neighbor in glove:
                                        missing_word_neighbors[word].append(
                                            glove[neighbor])
                        if (max_len > len(data[i][j][k][l])):
                            data[i][j][k][l] += [0] * int(
                                max_len -
                                len(data[i][j][k][l]))  # padding l-th sentence
                    data[i][j][k] = np.asarray(data[i][j][k], dtype='int32')
                    length[i][j][k] = np.asarray(length[i][j][k],
                                                 dtype='int32')
                    # np.array for all documents/summaries
                    # shape of each document/summary: (# sentence, max length)
    print('Calculate vectors for missing words by averaging neighbors......')
    #print(data)
    if (args.data == 'duc2007'):
        # duc2007: embeddings come straight from the reused CNN/DailyMail dump
        weight_matrix = cnn_data.weight
    else:
        # OOV embedding = mean of in-glove window neighbors (zeros if none)
        for word in missing_word_neighbors:
            vectors = missing_word_neighbors[word]
            if len(vectors) > 0:
                glove[word] = sum(vectors) / len(vectors)
            else:
                glove[word] = np.zeros((word_dim, ))
        weight_matrix = np.vstack([glove[w] for w in itow])
    print('Shape of weight matrix:')
    print(weight_matrix.shape)

    print('Dumping......')
    #print(data[2][0][0], data[2][1][0])
    # pickle order: data, length, weight_matrix, wtoi, itow
    save_file = open(args.save_path, 'wb')
    pickle.dump(data, save_file)
    pickle.dump(length, save_file)
    pickle.dump(weight_matrix, save_file)
    pickle.dump(wtoi, save_file)
    pickle.dump(itow, save_file)
    save_file.close()
Exemplo n.º 28
0
            if s[0] != 224:
                img = misc.imresize(img, (224, 224), 'bilinear')
            img = np.stack((img,img,img),axis =2)
            yield img, label[0]


from keras.models import load_model

# Load the trained slice-level model and evaluate it on the held-out test set.
model = load_model('/media/user1/model.h5')

test_features,test_label = relist(predict(test_set))

# Evaluation-time generator: no augmentation, fixed order.
test_generator = Dataset(
    test_features,
    test_label,
    augment=False,
    shuffle=False,
    input_form='t1',
    seed=seed,
)

test_generator.reset()
test_results = evaluate.get_results(model, test_generator)
probabilities = list(evaluate.transform_binary_probabilities(test_results))
np.save('./test_slice_pro.npy',probabilities)

# Threshold probabilities at 0.5 to obtain hard binary predictions.
lg_pred = np.zeros((len(probabilities)))
for i in range(len(probabilities)):
    if probabilities[i]<0.5:
        lg_pred[i] = 0
    else:
        lg_pred[i] = 1
Exemplo n.º 29
0
def main():
    """Train the DREAM next-basket model, checkpointing on best val loss.

    Builds user-basket datasets (with reorder features when
    ``constants.REORDER`` is set), trains for ``dr_config.epochs`` epochs,
    saves the model whenever validation loss improves, and exits cleanly on
    Ctrl-C.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = constants.GPUS

    # compile via TVM on GPU
    torch_tvm.enable(opt_level=3,
                     device_type="gpu",
                     device="cuda",
                     host="llvm")

    # Prepare input
    bc = BasketConstructor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)
    # Users' baskets
    ub_basket = bc.get_baskets('prior', reconstruct=False)

    if constants.REORDER:
        # Users' reordered baskets
        ub_rbks = bc.get_baskets('prior', reconstruct=False, reordered=True)
        # User's item history
        ub_ihis = bc.get_item_history('prior', reconstruct=False)
        # Train test split
        train_ub, test_ub, train_rbks, test_rbks, train_ihis, test_ihis = train_test_split(
            ub_basket, ub_rbks, ub_ihis, test_size=0.2)
        del ub_basket, ub_rbks, ub_ihis  # memory saving
        train_ub, test_ub = Dataset(train_ub, train_rbks, train_ihis), Dataset(
            test_ub, test_rbks, test_ihis)
        del train_rbks, test_rbks, train_ihis, test_ihis  # memory saving
    else:
        train_ub, test_ub = train_test_split(ub_basket, test_size=0.2)
        del ub_basket
        train_ub, test_ub = Dataset(train_ub), Dataset(test_ub)

    # Model config
    dr_config = Config(constants.DREAM_CONFIG)
    dr_model = DreamModel(dr_config)
    if dr_config.cuda:
        dr_model.cuda()

    # Optimizer
    optim = torch.optim.Adam(dr_model.parameters(), lr=dr_config.learning_rate)
    # optim = torch.optim.Adadelta(dr_model.parameters())
    # optim = torch.optim.SGD(dr_model.parameters(), lr=dr_config.learning_rate, momentum=0.9)
    writer = SummaryWriter(log_dir='runs/{}'.format(
        dr_config.alias))  # tensorboard writer
    writer.add_text('config', str(dr_config))
    best_val_loss = None

    try:
        # log the full configuration before training
        for k, v in constants.DREAM_CONFIG.items():
            print(k, v)

        # training
        # NOTE(review): train_*/evaluate_* are called with no arguments —
        # presumably they close over module-level state; confirm.
        for epoch in range(dr_config.epochs):
            if constants.REORDER:
                train_reorder_dream()
            else:
                train_dream()
            print('-' * 89)
            if constants.REORDER:
                val_loss = evaluate_reorder_dream()
            else:
                val_loss = evaluate_dream()
            print('-' * 89)
            # checkpoint: save the whole model whenever val loss improves
            if not best_val_loss or val_loss < best_val_loss:
                with open(
                        dr_config.checkpoint_dir.format(epoch=epoch,
                                                        loss=val_loss),
                        'wb') as f:
                    torch.save(dr_model, f)
                best_val_loss = val_loss
            else:
                # Manual SGD slow down lr if no improvement in val_loss
                # dr_config.learning_rate = dr_config.learning_rate / 4
                pass

    except KeyboardInterrupt:
        # allow graceful manual termination of a long training run
        print('*' * 89)
        print('Got keyboard Interrupt and stopped early')
Exemplo n.º 30
0
def main(args):
    """Train a text VAE / autoregressive LM / SVI / SAVAE model (selected by
    ``args.model``) with plain SGD, checkpointing the model whenever the
    validation NLL improves and decaying the LR after repeated plateaus.

    Expects ``args`` to carry all hyperparameters referenced below
    (seed, file paths, model dims, SVI settings, lr, warmup, etc.).
    """
    # Seed numpy and torch for reproducibility (CPU RNGs; CUDA kernels may
    # still be nondeterministic).
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_data = Dataset(args.train_file)
    val_data = Dataset(args.val_file)
    # NOTE(review): presumably Dataset.batch_size is a tensor of per-batch
    # sizes, so .sum() is the total sentence count -- confirm; this value is
    # never used in the visible body.
    train_sents = train_data.batch_size.sum()
    vocab_size = int(train_data.vocab_size)
    print('Train data: %d batches' % len(train_data))
    print('Val data: %d batches' % len(val_data))
    print('Word vocab size: %d' % vocab_size)
    if args.slurm == 0:
        # cuda.set_device(args.gpu)
        gpu_id = 0
        device = torch.device(
            f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu")
    # NOTE(review): when args.slurm != 0, `device` is never assigned, so the
    # model.to(device) call below raises NameError -- confirm the slurm path.
    if args.train_from == '':
        # Fresh model: build from hyperparameters and init all weights
        # uniformly in [-0.1, 0.1].
        model = RNNVAE(vocab_size=vocab_size,
                       enc_word_dim=args.enc_word_dim,
                       enc_h_dim=args.enc_h_dim,
                       enc_num_layers=args.enc_num_layers,
                       dec_word_dim=args.dec_word_dim,
                       dec_h_dim=args.dec_h_dim,
                       dec_num_layers=args.dec_num_layers,
                       dec_dropout=args.dec_dropout,
                       latent_dim=args.latent_dim,
                       mode=args.model)
        for param in model.parameters():
            param.data.uniform_(-0.1, 0.1)
    else:
        # Resume: the checkpoint stores the whole pickled model object.
        print('loading model from ' + args.train_from)
        checkpoint = torch.load(args.train_from)
        model = checkpoint['model']

    print("model architecture")
    print(model)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

    # KL annealing: beta ramps linearly from kl_start to 1 over `warmup`
    # epochs (see the per-batch update inside the training loop).
    if args.warmup == 0:
        args.beta = 1.
    else:
        args.beta = args.kl_start

    # Per-token NLL without reduction; summed/averaged manually per branch.
    # NOTE(review): `reduce=False` is deprecated -- modern PyTorch spells
    # this nn.NLLLoss(reduction='none').
    criterion = nn.NLLLoss(reduce=False)
    # criterion = nn.NLLLoss()
    # model.cuda()
    # criterion.cuda()
    # model = torch.nn.DataParallel(net, device_ids=[0, 1])
    model.to(device)
    criterion.to(device)
    model.train()

    def variational_loss(input, sents, model, z=None):
        """SVI inner objective: NLL of `sents` under a reparameterized
        sample from N(mean, exp(logvar)) plus beta-weighted diagonal KL.
        `input` is the (mean, logvar) pair being optimized."""
        mean, logvar = input
        z_samples = model._reparameterize(mean, logvar, z)
        preds = model._dec_forward(sents, z_samples)
        # Sum per-position NLL over the sequence (targets are shifted by 1).
        nll = sum([
            criterion(preds[:, l], sents[:, l + 1])
            for l in range(preds.size(1))
        ])
        kl = utils.kl_loss_diag(mean, logvar)
        return nll + args.beta * kl

    # The meta-optimizer refines variational params by gradient steps on
    # variational_loss; only decoder params accumulate gradients through it.
    update_params = list(model.dec.parameters())
    meta_optimizer = OptimN2N(variational_loss,
                              model,
                              update_params,
                              eps=args.eps,
                              lr=[args.svi_lr1, args.svi_lr2],
                              iters=args.svi_steps,
                              momentum=args.momentum,
                              acc_param_grads=args.train_n2n == 1,
                              max_grad_norm=args.svi_max_grad_norm)
    if args.test == 1:
        # Evaluation-only mode: score the test set and exit the process.
        args.beta = 1
        test_data = Dataset(args.test_file)
        # NOTE(review): `eval` here shadows the builtin -- presumably a
        # project-level evaluation function defined elsewhere in this file.
        eval(args, test_data, model, meta_optimizer, device)
        exit()

    t = 0
    best_val_nll = 1e5
    best_epoch = 0
    val_stats = []
    epoch = 0
    while epoch < args.num_epochs:
        start_time = time.time()
        epoch += 1
        print('Starting epoch %d' % epoch)
        # Running totals for this epoch, keyed by training branch.
        train_nll_vae = 0.
        train_nll_autoreg = 0.
        train_kl_vae = 0.
        train_nll_svi = 0.
        train_kl_svi = 0.
        train_kl_init_final = 0.
        num_sents = 0
        num_words = 0
        b = 0  # batch counter within the epoch

        # Visit batches in a fresh random order each epoch.
        for i in np.random.permutation(len(train_data)):
            if args.warmup > 0:
                # Linear KL annealing step, capped at 1.
                args.beta = min(
                    1, args.beta + 1. / (args.warmup * len(train_data)))

            sents, length, batch_size = train_data[i]
            length = length.item()
            batch_size = batch_size.item()

            if args.gpu >= 0:
                # sents = sents.cuda()
                sents = sents.to(device)
                # batch_size = batch_size.to(device)
            b += 1

            optimizer.zero_grad()
            if args.model == 'autoreg':
                # Pure autoregressive LM: no latent variable.
                preds = model._dec_forward(sents, None, True)
                tgt = sents[:, 1:].contiguous()
                # Per-token NLL summed over positions, averaged over batch.
                nll_autoreg = criterion(preds.view(-1, preds.size(2)),
                                        tgt.view(-1)).view(preds.size(0),
                                                           -1).sum(-1).mean(0)
                # nll_autoreg = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)])
                train_nll_autoreg += nll_autoreg.item() * batch_size
                # train_nll_autoreg += nll_autoreg.data[0]*batch_size #old
                nll_autoreg.backward()
            elif args.model == 'svi':
                # Pure SVI: variational params start at 0.1*zeros and are
                # refined by the meta-optimizer (no encoder network).
                # mean_svi = Variable(0.1*torch.zeros(batch_size, args.latent_dim).cuda(), requires_grad = True)
                # logvar_svi = Variable(0.1*torch.zeros(batch_size, args.latent_dim).cuda(), requires_grad = True)
                mean_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).to(device),
                    requires_grad=True)
                logvar_svi = Variable(
                    0.1 * torch.zeros(batch_size, args.latent_dim).to(device),
                    requires_grad=True)
                var_params_svi = meta_optimizer.forward(
                    [mean_svi, logvar_svi], sents, b % args.print_every == 0)
                mean_svi_final, logvar_svi_final = var_params_svi
                # detach(): decoder loss must not backprop into the refined
                # variational parameters here.
                z_samples = model._reparameterize(mean_svi_final.detach(),
                                                  logvar_svi_final.detach())
                preds = model._dec_forward(sents, z_samples)
                tgt = sents[:, 1:].contiguous()
                nll_svi = criterion(preds.view(-1, preds.size(2)),
                                    tgt.view(-1)).view(preds.size(0),
                                                       -1).sum(-1).mean(0)
                # nll_svi = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)])
                # NOTE(review): .data[0] raises on PyTorch >= 0.5; sibling
                # branches use .item() -- latent crash on the 'svi' path.
                train_nll_svi += nll_svi.data[0] * batch_size
                kl_svi = utils.kl_loss_diag(mean_svi_final, logvar_svi_final)
                train_kl_svi += kl_svi.data[0] * batch_size
                var_loss = nll_svi + args.beta * kl_svi
                var_loss.backward(retain_graph=True)
            else:
                # 'vae' or 'savae': amortized inference via the encoder.
                mean, logvar = model._enc_forward(sents)
                z_samples = model._reparameterize(mean, logvar)
                preds = model._dec_forward(sents, z_samples)
                tgt = sents[:, 1:].contiguous()
                nll_vae = criterion(preds.view(-1, preds.size(2)),
                                    tgt.view(-1)).view(preds.size(0),
                                                       -1).sum(-1).mean(0)
                # nll_vae = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)])
                # train_nll_vae += nll_vae.data[0]*batch_size#old
                train_nll_vae += nll_vae.item() * batch_size
                kl_vae = utils.kl_loss_diag(mean, logvar)
                # train_kl_vae += kl_vae.data[0]*batch_size#old
                train_kl_vae += kl_vae.item() * batch_size
                if args.model == 'vae':
                    vae_loss = nll_vae + args.beta * kl_vae
                    vae_loss.backward(retain_graph=True)
                if args.model == 'savae':
                    # SAVAE: refine the encoder's output with SVI steps,
                    # then train decoder on the refined params and push the
                    # refinement gradient back into the encoder.
                    var_params = torch.cat([mean, logvar], 1)
                    mean_svi = Variable(mean.data, requires_grad=True)
                    logvar_svi = Variable(logvar.data, requires_grad=True)
                    var_params_svi = meta_optimizer.forward(
                        [mean_svi, logvar_svi], sents,
                        b % args.print_every == 0)
                    mean_svi_final, logvar_svi_final = var_params_svi
                    z_samples = model._reparameterize(mean_svi_final,
                                                      logvar_svi_final)
                    preds = model._dec_forward(sents, z_samples)
                    tgt = sents[:, 1:].contiguous()
                    nll_svi = criterion(preds.view(-1, preds.size(2)),
                                        tgt.view(-1)).view(preds.size(0),
                                                           -1).sum(-1).mean(0)
                    # nll_svi = sum([criterion(preds[:, l], sents[:, l+1]) for l in range(length)])
                    # NOTE(review): .data[0] raises on PyTorch >= 0.5
                    # (cf. .item() above) -- latent crash on the savae path.
                    train_nll_svi += nll_svi.data[0] * batch_size
                    kl_svi = utils.kl_loss_diag(mean_svi_final,
                                                logvar_svi_final)
                    train_kl_svi += kl_svi.data[0] * batch_size
                    var_loss = nll_svi + args.beta * kl_svi
                    var_loss.backward(retain_graph=True)
                    if args.train_n2n == 0:
                        if args.train_kl == 1:
                            # Train encoder to match the SVI-refined
                            # posterior via KL(init || final).
                            mean_final = mean_svi_final.detach()
                            logvar_final = logvar_svi_final.detach()
                            kl_init_final = utils.kl_loss(
                                mean, logvar, mean_final, logvar_final)
                            train_kl_init_final += kl_init_final.data[
                                0] * batch_size
                            kl_init_final.backward(retain_graph=True)
                        else:
                            # Train encoder with the plain VAE gradient,
                            # injected through var_params.
                            vae_loss = nll_vae + args.beta * kl_vae
                            var_param_grads = torch.autograd.grad(
                                vae_loss, [mean, logvar], retain_graph=True)
                            var_param_grads = torch.cat(var_param_grads, 1)
                            var_params.backward(var_param_grads,
                                                retain_graph=True)
                    else:
                        # Full N2N: backprop through the SVI steps to get
                        # gradients w.r.t. the initial variational params.
                        var_param_grads = meta_optimizer.backward(
                            [mean_svi_final.grad, logvar_svi_final.grad],
                            b % args.print_every == 0)
                        var_param_grads = torch.cat(var_param_grads, 1)
                        var_params.backward(var_param_grads)
            if args.max_grad_norm > 0:
                # NOTE(review): clip_grad_norm is deprecated; modern PyTorch
                # uses the in-place clip_grad_norm_.
                torch.nn.utils.clip_grad_norm(model.parameters(),
                                              args.max_grad_norm)
            optimizer.step()
            num_sents += batch_size
            num_words += batch_size * length
            # num_sents = num_sents.item()
            # num_words = num_words.item()
            if b % args.print_every == 0:
                # L2 norm of all parameters, for monitoring.
                # NOTE(review): .data[0] on a 0-dim tensor fails on
                # PyTorch >= 0.5 -- would be .item() today.
                param_norm = sum([p.norm()**2
                                  for p in model.parameters()]).data[0]**0.5
                print(
                    'Iters: %d, Epoch: %d, Batch: %d/%d, LR: %.4f, TrainARPPL: %.2f, TrainVAE_PPL: %.2f, TrainVAE_KL: %.4f, TrainVAE_PPLBnd: %.2f, TrainSVI_PPL: %.2f, TrainSVI_KL: %.4f, TrainSVI_PPLBnd: %.2f, KLInitFinal: %.2f, |Param|: %.4f, BestValPerf: %.2f, BestEpoch: %d, Beta: %.4f, Throughput: %.2f examples/sec'
                    %
                    (t, epoch, b + 1, len(train_data), args.lr,
                     np.exp(train_nll_autoreg / num_words),
                     np.exp(
                         train_nll_vae / num_words), train_kl_vae / num_sents,
                     np.exp((train_nll_vae + train_kl_vae) / num_words),
                     np.exp(
                         train_nll_svi / num_words), train_kl_svi / num_sents,
                     np.exp((train_nll_svi + train_kl_svi) / num_words),
                     train_kl_init_final / num_sents, param_norm, best_val_nll,
                     best_epoch, args.beta, num_sents /
                     (time.time() - start_time)))

        print('--------------------------------')
        print('Checking validation perf...')
        # `eval` is the project's evaluation routine (shadows the builtin).
        val_nll = eval(args, val_data, model, meta_optimizer, device)
        val_stats.append(val_nll)

        # if val_elbo > self.best_val_elbo:
        #     self.not_improved = 0
        #     self.best_val_elbo = val_elbo
        # else:
        #     self.not_improved += 1
        #     if self.not_improved % 5 == 0:
        #         self.current_lr = self.current_lr * self.config.options.lr_decay
        #         print(f'New LR {self.current_lr}')
        #         model.optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr)
        #         model.enc_optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr)
        #         model.dec_optimizer = torch.optim.SGD(model.parameters(), lr=self.current_lr)

        if val_nll < best_val_nll:
            not_improved = 0
            # Remove the previous best checkpoint (named after the old
            # best_val_nll) before saving the new one.
            best_save = '{}_{}.pt'.format(args.checkpoint_path, best_val_nll)
            if os.path.exists(best_save):
                os.remove(best_save)

            best_val_nll = val_nll
            best_epoch = epoch
            # Move to CPU so the checkpoint is loadable without a GPU.
            model.cpu()
            checkpoint = {
                'args': args.__dict__,
                'model': model,
                'val_stats': val_stats
            }
            # NOTE(review): "Savaeng" is a typo for "Saving" in this runtime
            # message (left untouched in this doc-only pass).
            print('Savaeng checkpoint to %s' % args.checkpoint_path)
            best_save = '{}_{}.pt'.format(args.checkpoint_path, best_val_nll)
            torch.save(checkpoint, best_save)

            # model.cuda()
            model.to(device)
        else:
            # NOTE(review): if the very first epoch does not improve on the
            # 1e5 sentinel, `not_improved` is referenced before assignment
            # (NameError) -- it is only initialized in the improving branch.
            not_improved += 1
            if not_improved % 5 == 0:
                # Plateau for 5 consecutive epochs: decay the LR in place on
                # the existing optimizer's param groups.
                not_improved = 0
                args.lr = args.lr * args.lr_decay
                print(f'New LR: {args.lr}')
                for param_group in optimizer.param_groups:
                    param_group['lr'] = args.lr