Example #1
def train_model(cuda, vocab_file, data_pkls, save_pretrain_file):
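    """Pretrain an ALBERT-style model, cycling through the pickled data shards one per epoch."""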
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 24
    config.n_epoch = 3
    print(config)

    offset = 0
    model = albert_model.AlBertPretrain(config)
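    # resume from an existing pretraining checkpoint and continue with the next epoch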
    if os.path.isfile(save_pretrain_file):
        offset = model.bert.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

    train_loader = None

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = None
    scheduler = None

    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
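        # release the previous loader and load the data shard assigned to this epoch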
        if train_loader is not None:
            del train_loader
        data_pkl = data_pkls[epoch % len(data_pkls)]
        print(f"load pretrain data from {data_pkl}")
        train_loader = data.build_pretrain_loader(data_pkl, vocab,
                                                  config.n_batch)
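        # build the optimizer and schedule once, after the first loader reveals the steps per epoch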
        if optimizer is None or scheduler is None:
            t_total = len(train_loader) * config.n_epoch
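            # exclude biases and LayerNorm weights from weight decay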
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                config.weight_decay
            }, {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]
            optimizer = optim.RAdam(optimizer_grouped_parameters,
                                    lr=config.learning_rate,
                                    eps=config.adam_epsilon)
            scheduler = optim.WarmupLinearSchedule(
                optimizer, warmup_steps=config.warmup_steps, t_total=t_total)

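        # run one full pass over the shard, then checkpoint the shared bert weights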
        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler,
                    train_loader)
        model.bert.save(epoch, save_pretrain_file)
Example #2
def train_model(cuda, vocab_file, data_pkl, save_pretrain_file):
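    """Pretrain a Transformer-XL style model on a single pickled token stream."""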
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)
    token_ids = data.load_pretrain(data_pkl)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.n_enc_vocab = len(vocab)
    config.n_dec_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 64
    config.n_epoch = 3
    print(config)
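    # NOTE: overrides the device selected above and forces training on the CPU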
    config.device = torch.device("cpu")

    offset = 0
    model = txl_model.TXLPretrain(config)
    if os.path.isfile(save_pretrain_file):
        offset = model.decoder.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

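    # build the training iterator over the loaded token ids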
    train_iter = data.TXLIterator(config, token_ids)

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

    t_total = len(train_iter) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        config.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer,
                                           warmup_steps=config.warmup_steps,
                                           t_total=t_total)

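    # train for n_epoch epochs, checkpointing the decoder weights after each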
    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler,
                    train_iter)
        model.decoder.save(epoch, save_pretrain_file)
Example #3
def train_model(cuda, vocab_file, data_pkl, save_file, save_pretrain_file):
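    """Fine-tune a GPT-style model on SNLI and keep the checkpoint with the best test accuracy."""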
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)
    (train_label, train_sentence1, train_sentence2,
     valid_label, valid_sentence1, valid_sentence2,
     test_label, test_sentence1, test_sentence2) = global_data.load_snli(data_pkl)

    # cuda or cpu
    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    print(config)

    model = gpt_model.SNLI(config)
    if os.path.isfile(save_file):
        model.load(save_file)
        print(">>>> load state dict from: ", save_file)
    elif os.path.isfile(save_pretrain_file):
        epoch = model.decoder.load(save_pretrain_file)
        print(">>>> load state dict from: ", save_pretrain_file, "epoch:", epoch)
    model.to(config.device)

    train_loader = data.build_data_loader(train_label, train_sentence1, train_sentence2, config.n_batch)
    # train_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch) ## only for fast test
    valid_loader = data.build_data_loader(valid_label, valid_sentence1, valid_sentence2, config.n_batch)
    test_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch)

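    # auxiliary language-model loss ignores padding; the SNLI classification loss averages over the batch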
    lm_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=config.i_pad, reduction='mean')
    snli_loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
    
    t_total = len(train_loader) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = optim.RAdam(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer, warmup_steps=config.warmup_steps, t_total=t_total)
    
    best_epoch, best_loss, best_val, best_test = None, None, None, None
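    # save a new checkpoint whenever the test score improves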
    for epoch in trange(config.n_epoch, desc="Epoch"):
        score_loss = train_epoch(config, epoch, model, config.lm_coef, lm_loss_fn, snli_loss_fn, optimizer, scheduler, train_loader)
        score_val = eval_epoch(config, epoch, model, valid_loader, "Valid")
        score_test = eval_epoch(config, epoch, model, test_loader, "Test")

        if best_test is None or best_test < score_test:
            model.save(epoch, score_loss, score_val, score_test, save_file)
            best_epoch, best_loss, best_val, best_test = epoch, score_loss, score_val, score_test
            print(f">>>>>>> model saved at {save_file} {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")
        else:
            print(f">>>>>>> model not seved under accuracy {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")