Exemplo n.º 1
0
        # Restore the agent's weights from the checkpoint and resume at the
        # epoch following the one recorded in it.
        agent.load_state_dict(checkpoint['agent'])
        start_epoch = checkpoint['epoch'] + 1
        print('loaded agent from', args.load)

    # Optionally wrap both networks in nn.DataParallel.
    if args.parallel:
        agent = nn.DataParallel(agent)
        seg_model = nn.DataParallel(seg_model)

    # seg_model is switched to eval mode and moved to the GPU; only the
    # agent's parameters are handed to the optimizer below.
    seg_model.eval().cuda()
    agent.cuda()

    optimizer = optim.Adam(agent.parameters(),
                           lr=args.lr,
                           weight_decay=args.wd)

    # NOTE(review): utils.LrScheduler is project-local; presumably it decays
    # args.lr by args.lr_decay_ratio every args.epoch_step epochs -- confirm.
    lr_scheduler = utils.LrScheduler(optimizer, args.lr, args.lr_decay_ratio,
                                     args.epoch_step)
    # The upper bound is start_epoch + args.max_epochs + 1, so the loop body
    # runs args.max_epochs + 1 times.
    for epoch in range(start_epoch, start_epoch + args.max_epochs + 1):
        lr_scheduler.adjust_learning_rate(epoch)

        # While cl_step is below num_blocks it is recomputed as
        # 1 + epoch // 10 (one more block every 10 epochs); once it is at or
        # above num_blocks it is pinned to num_blocks.
        # NOTE(review): the recomputed value is not clamped, so when resuming
        # from a large start_epoch cl_step can exceed num_blocks for one
        # epoch before the else branch pins it -- confirm train() tolerates
        # that.
        if args.cl_step < num_blocks:
            args.cl_step = 1 + 1 * (epoch // 10)
        else:
            args.cl_step = num_blocks

        print('training the last %d blocks ...' % args.cl_step)
        train(epoch)

        # Evaluate on every 10th epoch (epoch 0 excluded) with gradient
        # tracking disabled.
        with torch.no_grad():
            if epoch != 0 and epoch % 10 == 0:
                test(epoch)
Exemplo n.º 2
0
    vocab_size = args.max_vocab_size

# Build the tagger; the number of output tags is taken from tag2idx.
model = BiLSTM(vocab_size=vocab_size,
               embed_dim=args.embed_dim,
               hidden_dim=args.hidden_dim,
               num_tags=len(tag2idx),
               embed_matrix=embed_mat)

# from_logits=True tells the loss to treat the model outputs as unnormalized
# logits, which may be more numerically stable than passing probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

#%% Define optimizer
# Count the batches by streaming over them once instead of materializing
# every batch in a list just to take its length.
num_batches = sum(1 for _ in train_batches)
total_steps = num_batches * args.epochs
warm_steps = int(total_steps * args.warm_frac)

# NOTE(review): utils.LrScheduler / utils.WdScheduler are project-local;
# presumably they warm up over warm_steps and then decay until total_steps
# -- confirm against utils.
lr_scheduler = utils.LrScheduler(args.lr, warm_steps, total_steps)
wd_scheduler = utils.WdScheduler(1e-2, warm_steps, total_steps)

## Adam optimizer (alternative, kept for reference)
# optimizer = tf.keras.optimizers.Adam(learning_rate = lr_scheduler)

## AdamW optimizer
# The optimizer is created with a placeholder weight-decay callable, which is
# then rebound to a lambda that evaluates wd_scheduler at the optimizer's
# current iteration count. The rebinding has to happen after construction
# because the lambda refers to `optimizer` itself.
optimizer = tfa.optimizers.AdamW(learning_rate=lr_scheduler,
                                 weight_decay=lambda: None)
optimizer.weight_decay = lambda: wd_scheduler(optimizer.iterations)


#%%
@tf.function(
    experimental_relax_shapes=True
)  # Passing tensors with different shapes - need to relax shapes to avoid unnecessary retracing