Example #1
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Project-local helpers assumed importable in this repository:
# setup, TransformerModel, Dictionary, dataloader, Trainer

def train(rank, args):
    print(f"Running basic DDP example on rank {rank} {args.master_port}.")
    setup(rank, args.world_size, args.master_port)
    args.local_rank = rank

    # Seed for reproducibility and bind this process to its own GPU.
    torch.manual_seed(args.seed)
    torch.cuda.set_device(rank)

    # Shared source/target vocabularies.
    src_vocab = Dictionary.read_vocab(args.vocab_src)
    tgt_vocab = Dictionary.read_vocab(args.vocab_tgt)
    batch_size = args.batch_size

    # Build the sequence-to-sequence Transformer from the parsed arguments.
    model = TransformerModel(d_model=args.d_model,
                             nhead=args.nhead,
                             num_encoder_layers=args.num_encoder_layers,
                             num_decoder_layers=args.num_decoder_layers,
                             dropout=args.dropout,
                             attention_dropout=args.attn_dropout,
                             src_dictionary=src_vocab,
                             tgt_dictionary=tgt_vocab)
    # Move the model to this process's GPU, then wrap it so gradients
    # are synchronized across ranks on every backward pass.
    model.to(rank)
    model = DDP(model, device_ids=[rank])

    if rank == 0:
        print(model)
        # Report parameter counts once, from rank 0 only.
        print('num. model params: {} (num. trained: {})'.format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
        ))

    # Build training and validation loaders over the parallel corpora.
    train_loader = dataloader.get_train_parallel_loader(args.train_src,
                                                        args.train_tgt,
                                                        src_vocab,
                                                        tgt_vocab,
                                                        batch_size=batch_size)
    valid_loader = dataloader.get_valid_parallel_loader(args.valid_src,
                                                        args.valid_tgt,
                                                        src_vocab,
                                                        tgt_vocab,
                                                        batch_size=batch_size)

    data = {'dataloader': {'train': train_loader, 'valid': valid_loader}}

    trainer = Trainer(model, data, args)
    # Epochs are 1-indexed; run through args.max_epoch inclusive.
    for epoch in range(1, args.max_epoch + 1):
        trainer.mt_step(epoch)          # one training pass over the data
        trainer.evaluate(epoch)         # validation
        trainer.save_checkpoint(epoch)

    # Tear down the default process group initialized in setup().
    dist.destroy_process_group()
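
The `setup` helper called at the top of `train` is not shown. Below is a minimal sketch of a typical DDP setup function and a matching launcher, assuming one worker process per GPU and the NCCL backend; `parse_args` is a hypothetical stand-in for whatever builds the `args` object, and the repository's actual helper may differ:

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def setup(rank, world_size, master_port):
    # Point every process at the same rendezvous address, then join
    # the default process group (NCCL for single-node multi-GPU).
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = str(master_port)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

if __name__ == "__main__":
    args = parse_args()  # hypothetical: builds the fields train() reads
    # Fork one worker per GPU; each receives its rank as the first argument.
    mp.spawn(train, args=(args,), nprocs=args.world_size, join=True)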
Example #2
# Requires `import os` at module level; TransformerModel comes from the
# project's model package.
def on_init(self, params, p_params):
    # Resume from an existing debias run if its checkpoint is present;
    # otherwise fall back to the regular checkpoint under dump_path.
    dump_path = os.path.join(params.dump_path, "debias")
    checkpoint_path = os.path.join(dump_path, "checkpoint.pth")
    if os.path.isfile(checkpoint_path):
        self.params.dump_path = dump_path
        self.checkpoint_path = checkpoint_path
        self.from_deb = True
    else:
        self.checkpoint_path = os.path.join(params.dump_path,
                                            "checkpoint.pth")
        self.from_deb = False

    # Debiasing network: an encoder-only Transformer that shares the main
    # model's dictionary but has no output projection or embedding layer.
    deb = TransformerModel(p_params,
                           self.model.dico,
                           is_encoder=True,
                           with_output=False,
                           with_emb=False)
    # deb = LinearDeb(p_params)  # alternative: a simple linear debiaser
    self.deb = deb.to(params.device)
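
For context, here is a minimal sketch of how the checkpoint path chosen in `on_init` might be consumed when resuming. The `reload` method and the `"deb"` checkpoint key are assumptions for illustration, not code from this source:

import os
import torch

def reload(self):
    # Hypothetical resume step: restore the debiaser's weights from the
    # checkpoint selected in on_init (the "deb" key is an assumption).
    if os.path.isfile(self.checkpoint_path):
        state = torch.load(self.checkpoint_path, map_location="cpu")
        if self.from_deb and "deb" in state:
            self.deb.load_state_dict(state["deb"])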