示例#1
0
def main():
    """Parse CLI args, build the NeMo training DAG, and run training."""
    args = parse_args()
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.num_epochs,
        args.weight_decay,
        args.optimizer,
        args.iter_per_step,
    )
    # Place logs under work_dir when one is given; otherwise the run name
    # itself is used as a relative log directory.
    log_dir = os.path.join(args.work_dir, name) if args.work_dir else name

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir,
    )
    args.num_gpus = neural_factory.world_size

    # NOTE: a dead local `checkpoint_dir = neural_factory.checkpoint_dir`
    # was removed here; the value was never read afterwards.
    if args.local_rank is not None:
        nemo.logging.info('Doing ALL GPU')

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch,
                                  warmup_steps=args.warmup_steps),
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
    )
示例#2
0
# Log the raw training loss every 50 steps; evaluation tensor lists are
# summarised through compute_accuracy.
callback = nemo.core.SimpleLossLoggerCallback(
    tensor_list2str=lambda tensors: str(tensors[0].item()),
    tensor_list2str_evl=lambda tensors: compute_accuracy(tensors),
    step_freq=50,
    tb_writer=tb_writer)

# Run evaluation on loss / outputs / labels every 10000 training steps.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[e_loss, e_outputs, e_labels],
    eval_step=10000,
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    tb_writer=tb_writer)

# Instantiate an optimizer to perform `train` action
optimization_params = {
    "num_epochs": num_epochs,
    "lr": learning_rate,
    "max_steps": max_steps,
    "weight_decay": weight_decay,
    "momentum": momentum,
}
optimizer = neural_factory.get_trainer(
    params={"optimization_params": optimization_params})

optimizer.train(
    tensors_to_optimize=[train_loss],
    tensors_to_evaluate=[outputs, labels],
    callbacks=[callback, callback_eval],
    lr_policy=SquareAnnealing(num_epochs * step_per_epoch))
示例#3
0
    # Per-dataset evaluator: runs every `eval_freq` steps and reports via
    # the WER epochs-done callback; the tokenizer is bound into the
    # iteration callback through the lambda.
    callback = nemo.core.EvaluatorCallback(
        eval_tensors=all_eval_tensors[eval_dataset],
        user_iter_callback=lambda x, y: eval_iter_callback(x, y, tokenizer),
        user_epochs_done_callback=eval_epochs_done_callback_wer,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks.append(callback)

# Persist checkpoints into the working directory at a fixed step frequency.
checkpointer_callback = CheckpointCallback(folder=args.work_dir,
                                           step_freq=args.checkpoint_save_freq)
callbacks.append(checkpointer_callback)

# define learning rate decay policy
# Square annealing down to min_lr over max_steps, preceded by a warmup
# phase of warmup_steps.
lr_policy = SquareAnnealing(total_steps=args.max_steps,
                            min_lr=1e-5,
                            warmup_steps=args.warmup_steps)

# Create trainer and execute training action
# NOTE(review): this call appears truncated in the visible excerpt — the
# closing parenthesis lies outside it.
nf.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    lr_policy=lr_policy,
    optimization_params={
        "num_epochs": 300,
        "max_steps": args.max_steps,
        "lr": args.lr,
        "weight_decay": args.weight_decay,
    },
    batches_per_step=args.iter_per_step,
示例#4
0
def main():
    """Parse args/config, build the computational graph, wire logging and
    evaluation callbacks, and launch training."""
    # Parse args
    args = parse_args()
    cfg = parse_cfg(args)
    name = construct_name(args, cfg)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=name,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir)

    logger = neural_factory.logger
    tb_writer = neural_factory.tb_writer
    # Propagate the factory's effective checkpoint dir back into args so
    # downstream code sees the resolved path.
    args.checkpoint_dir = neural_factory.checkpoint_dir

    logger.info(f'Name:\n{name}')
    logger.info(f'Args to be passed to job #{args.local_rank}:')
    logger.info(pformat(vars(args)))

    # Seed every RNG source for reproducibility when a seed is supplied.
    if args.random_seed is not None:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)
        torch.manual_seed(args.random_seed)
        logger.info(f'Using seed {args.random_seed}')

    # Defining computational graph
    (train_loss, evals), cfg, dag_callbacks = create_dag(
        args, cfg, neural_factory.world_size)
    logger.info('Config:')
    logger.info(pformat(cfg))

    num_data = cfg['input']['train']['num_data']
    steps_per_epoch = cfg['optimization']['steps_per_epoch']
    total_steps = cfg['optimization']['total_steps']
    logger.info(f'Num data: {num_data}\n'
                f'Steps per epoch: {steps_per_epoch}\n'
                f'Total steps: {total_steps}')

    dag_callbacks[0].tb_writer = tb_writer

    # Callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[train_loss],
        print_func=lambda x: logger.info(f"Loss: {x[0].item()}"),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=tb_writer
    )
    log_callbacks = [train_callback]
    target = cfg['target']
    labels = target['labels']
    # NOTE(review): `sss` is not defined in this excerpt — presumably an
    # iterable of special symbols defined at module level; verify.
    specials = {f'{ss.name}_id': target[f'{ss.name}_id'] for ss in sss}
    # BUGFIX: the loop variable used to be `name`, shadowing the run name
    # computed above; renamed to `eval_name`.
    for eval_name, tensors in evals:
        eval_callback = nemo.core.EvaluatorCallback(
            # TODO: Should be fixed soon, so we don't need to pass exactly list
            eval_tensors=list(tensors),
            user_iter_callback=partial(
                process_evaluation_batch,
                labels=labels,
                specials=specials,
                write_attn=False
            ),
            user_epochs_done_callback=partial(
                process_evaluation_epoch,
                tag=os.path.basename(eval_name),
                logger=logger
            ),
            eval_step=args.eval_freq,
            tb_writer=tb_writer
        )
        log_callbacks.append(eval_callback)
    # noinspection PyTypeChecker
    callbacks = log_callbacks + dag_callbacks

    # Optimize
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=SquareAnnealing(
            cfg['optimization']['total_steps'],
            min_lr=cfg['optimization']['min_lr'],
            warmup_steps=(
                    cfg['optimization']['warmup_epochs']
                    * cfg['optimization']['steps_per_epoch']
            )
        ),
        optimizer=cfg['optimization']['optimizer'],
        optimization_params=cfg['optimization']['params'],
        batches_per_step=args.iter_per_step
    )
示例#5
0
 def test_square(self):
     """SquareAnnealing decays monotonically with non-increasing drops."""
     annealer = SquareAnnealing(100)
     rates = [annealer(1e-3, step, 0) for step in (0, 10, 20)]
     self.assertTrue(rates[0] >= rates[1])
     self.assertTrue(rates[1] >= rates[2])
     self.assertTrue(rates[0] - rates[1] >= rates[1] - rates[2])
示例#6
0
 def test_warmup(self):
     """With warmup the LR first rises, then anneals back down."""
     annealer = SquareAnnealing(100, warmup_ratio=0.5)
     start, middle, end = [annealer(1e-3, step, 0) for step in (0, 50, 100)]
     self.assertTrue(start < middle)
     self.assertTrue(middle > end)
示例#7
0
# Optimisation steps per epoch: each step consumes
# batch_size * num_gpus * batch_per_step samples (integer division —
# a trailing partial batch is dropped).
train_data_size = len(train_data_layer)
steps_per_epoch = train_data_size // (
    args.batch_size * args.num_gpus * args.batch_per_step)

# Evaluate once per epoch on the MLM loss only (NSP loss commented out).
callback_dev = nemo.core.EvaluatorCallback(
    # eval_tensors=[dev_mlm_loss, dev_nsp_loss],
    eval_tensors=[dev_mlm_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=steps_per_epoch,
    tb_writer=tb_writer)

# define learning rate decay policy
if args.lr_decay_policy == "poly":
    lr_policy = SquareAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "cosine":
    lr_policy = CosineAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "noam":
    lr_policy = \
        InverseSquareRootAnnealing(args.num_epochs * steps_per_epoch,
                                   warmup_ratio=args.lr_warmup_proportion)
else:
    # Same exception type as before, now with a diagnostic message.
    raise NotImplementedError(
        f"Unknown lr_decay_policy: {args.lr_decay_policy!r}; "
        "expected 'poly', 'cosine' or 'noam'")

# save config file
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(args.checkpoint_directory, exist_ok=True)

config_path = os.path.join(args.checkpoint_directory, "bert-config.json")
示例#8
0
# Print the raw training loss every 100 steps.
callback = nemo.core.SimpleLossLoggerCallback(
    tensor_list2str=lambda tensors: str(tensors[0].item()),
    tb_writer=tb_writer,
    step_freq=100)
# callback which calculates evaluation loss without label smoothing
# and BLEU scores between outputs of beam search and reference translations
callback_dev = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=args.eval_step_frequency,
    tb_writer=tb_writer)

# define learning rate decay policy via a name -> class dispatch table
annealing_classes = {
    "poly": SquareAnnealing,
    "cosine": CosineAnnealing,
    "noam": InverseSquareRootAnnealing,
}
if args.lr_decay_policy not in annealing_classes:
    raise NotImplementedError
lr_policy = annealing_classes[args.lr_decay_policy](
    args.max_num_steps, warmup_steps=args.warmup_steps)

# define and launch training algorithm (optimizer)
# NOTE(review): this call is truncated in the visible excerpt; the
# remaining optimization params and closing brackets lie outside it.
optimizer = neural_factory.get_trainer(
    params={
        "optimizer_kind": args.optimizer,
        "optimization_params": {
            "num_epochs": args.max_num_epochs,
示例#9
0
        }
    })

# Steps per epoch given the global batch (per-GPU batch * num GPUs);
# integer division drops any trailing partial batch.
train_data_size = len(train_data_layer)
steps_per_epoch = train_data_size // (args.batch_size * args.num_gpus)

print("steps_per_epoch =", steps_per_epoch)

# Evaluate once per epoch; iteration results are decoded against tag_ids
# and the epoch summary is written to output_filename.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[eval_logits, eval_seq_ids],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, eval_data_layer,
                                                       tag_ids),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(
        x, tag_ids, args.output_filename),
    tb_writer=tb_writer,
    eval_step=steps_per_epoch)

# Select the learning-rate schedule.
if args.lr_policy == "lr_warmup":
    lr_policy_func = WarmupAnnealing(args.num_epochs * steps_per_epoch,
                                     warmup_ratio=args.lr_warmup_proportion)
elif args.lr_policy == "lr_poly":
    lr_policy_func = SquareAnnealing(args.num_epochs * steps_per_epoch)
elif args.lr_policy == "lr_cosine":
    lr_policy_func = CosineAnnealing(args.num_epochs * steps_per_epoch)
else:
    # BUGFIX: the old message omitted the supported lr_cosine option.
    raise ValueError(
        "Invalid lr_policy, must be lr_warmup, lr_poly or lr_cosine")

optimizer.train(tensors_to_optimize=[train_loss],
                callbacks=[callback_train, callback_eval],
                lr_policy=lr_policy_func)