Example #1
def train_model(cuda, vocab_file, data_pkls, save_pretrain_file):
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 24
    config.n_epoch = 3
    print(config)

    offset = 0
    model = albert_model.AlBertPretrain(config)
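    # if a pretrain checkpoint exists, resume from it and continue epoch numbering after it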
    if os.path.isfile(save_pretrain_file):
        offset = model.bert.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

    train_loader = None

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = None
    scheduler = None

    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
        if train_loader is not None:
            del train_loader
        data_pkl = data_pkls[epoch % len(data_pkls)]
        print(f"load pretrain data from {data_pkl}")
        train_loader = data.build_pretrain_loader(data_pkl, vocab,
                                                  config.n_batch)
        if optimizer is None or scheduler is None:
            t_total = len(train_loader) * config.n_epoch
            no_decay = ['bias', 'LayerNorm.weight']
            # bias and LayerNorm weights are excluded from weight decay
            optimizer_grouped_parameters = [{
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': config.weight_decay
            }, {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]
            optimizer = optim.RAdam(optimizer_grouped_parameters,
                                    lr=config.learning_rate,
                                    eps=config.adam_epsilon)
            scheduler = optim.WarmupLinearSchedule(
                optimizer, warmup_steps=config.warmup_steps, t_total=t_total)

        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler,
                    train_loader)
        model.bert.save(epoch, save_pretrain_file)
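
# Examples #1, #2, and #4 all split parameters into a weight-decay group and a
# no-decay group by matching substrings of the parameter names. A minimal,
# self-contained sketch of how that match behaves; `Tiny` is a stand-in module,
# not one of the project's model classes:
import torch

class Tiny(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)
        # the attribute must literally be named `LayerNorm` for the
        # 'LayerNorm.weight' substring test to match, as in BERT-style models
        self.LayerNorm = torch.nn.LayerNorm(8)

tiny = Tiny()
no_decay = ['bias', 'LayerNorm.weight']
decay = [n for n, _ in tiny.named_parameters()
         if not any(nd in n for nd in no_decay)]
skip = [n for n, _ in tiny.named_parameters()
        if any(nd in n for nd in no_decay)]
print(decay)  # ['proj.weight']
print(skip)   # ['proj.bias', 'LayerNorm.weight', 'LayerNorm.bias']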
Example #2
def train_model(cuda, vocab_file, data_pkl, save_pretrain_file):
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)
    token_ids = data.load_pretrain(data_pkl)

    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.n_enc_vocab = len(vocab)
    config.n_dec_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    config.n_batch = 64
    config.n_epoch = 3
    print(config)
    config.device = torch.device("cpu")

    offset = 0
    model = txl_model.TXLPretrain(config)
    if os.path.isfile(save_pretrain_file):
        offset = model.decoder.load(save_pretrain_file) + 1
        print(">>>> load state dict from: ", save_pretrain_file)
    model.to(config.device)

    train_iter = data.TXLIterator(config, token_ids)

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)

    t_total = len(train_iter) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    # bias and LayerNorm weights are excluded from weight decay
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': config.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=config.learning_rate,
                            eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer,
                                           warmup_steps=config.warmup_steps,
                                           t_total=t_total)

    for step in trange(config.n_epoch, desc="Epoch"):
        epoch = step + offset
        train_epoch(config, epoch, model, loss_fn, optimizer, scheduler,
                    train_iter)
        model.decoder.save(epoch, save_pretrain_file)
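
# Examples #1 and #2 schedule the learning rate with optim.WarmupLinearSchedule
# (a project-local module). Assuming it follows the usual warmup-then-linear-
# decay shape (an assumption, not verified against the project's optim module),
# the multiplier applied to the base learning rate would be:
def warmup_linear_multiplier(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)  # linear warmup from 0 to 1
    # linear decay from 1 down to 0 over the remaining steps
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))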
Example #3
def demp_pretrain(vocab_file, corpus, file):
    args = cfg.Config({
        "max_seq_len": 512,
        "short_seq_prob": 0.1,
        "masked_lm_prob": 0.15,
        "max_predictions_per_seq": 20,
    })
    vocab = global_data.load_vocab(vocab_file)

    print(f"read {corpus}, write {file}")
    docs = []
    with open(corpus) as f:
        doc = []
        for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
            line = line.strip()
            if line == "":
                if doc:
                    docs.append(doc)
                    doc = []
            else:
                tokens = vocab.encode_as_ids(line.lower())
                if tokens:
                    doc.append(tokens)
        if doc:
            docs.append(doc)
    if len(docs) <= 1:
        exit("ERROR: more documnet need")

    vocab_list = []
    for piece_id in range(vocab.get_piece_size()):
        if not vocab.is_unknown(piece_id):
            vocab_list.append(piece_id)

    with open(file, "w") as f:
        with tqdm(total=len(docs), desc=f"Document") as pbar:
            for doc_idx in range(len(docs)):
                timestamp1 = time.time()
                doc_instances = create_instances_from_document(
                    docs,
                    doc_idx,
                    max_seq_length=args.max_seq_len,
                    short_seq_prob=args.short_seq_prob,
                    masked_lm_prob=args.masked_lm_prob,
                    max_predictions_per_seq=args.max_predictions_per_seq,
                    vocab=vocab,
                    vocab_list=vocab_list)
                for instance in doc_instances:
                    f.write(json.dumps(instance))
                    f.write("\n")
                timestamp2 = time.time()
                if (timestamp2 - timestamp1) > 60:
                    print(f">>>> slow document: {timestamp2 - timestamp1:.1f}s "
                          f"for {len(doc_instances)} instances")
                pbar.update(1)
                pbar.set_postfix_str(f"Instances: {len(doc_instances)}")
Example #4
def train_model(cuda, vocab_file, data_pkl, save_file, save_pretrain_file):
    config = cfg.Config.load("config.json")

    vocab = global_data.load_vocab(vocab_file)
    (train_label, train_sentence1, train_sentence2,
     valid_label, valid_sentence1, valid_sentence2,
     test_label, test_sentence1, test_sentence2) = global_data.load_snli(data_pkl)

    # cuda or cpu
    config.device = torch.device(cuda if torch.cuda.is_available() else "cpu")
    config.n_vocab = len(vocab)
    config.i_pad = global_data.PAD_ID
    print(config)

    model = gpt_model.SNLI(config)
    if os.path.isfile(save_file):
        model.load(save_file)
        print(">>>> load state dict from: ", save_file)
    elif os.path.isfile(save_pretrain_file):
        epoch = model.decoder.load(save_pretrain_file)
        print(">>>> load state dict from: ", save_pretrain_file, "epoch:", epoch)
    model.to(config.device)

    train_loader = data.build_data_loader(train_label, train_sentence1, train_sentence2, config.n_batch)
    # train_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch) ## only for fast test
    valid_loader = data.build_data_loader(valid_label, valid_sentence1, valid_sentence2, config.n_batch)
    test_loader = data.build_data_loader(test_label, test_sentence1, test_sentence2, config.n_batch)

    lm_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=config.i_pad, reduction='mean')
    snli_loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
    
    t_total = len(train_loader) * config.n_epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = optim.RAdam(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = optim.WarmupLinearSchedule(optimizer, warmup_steps=config.warmup_steps, t_total=t_total)
    
    best_epoch, best_loss, best_val, best_test = None, None, None, None
    for epoch in trange(config.n_epoch, desc="Epoch"):
        score_loss = train_epoch(config, epoch, model, config.lm_coef, lm_loss_fn, snli_loss_fn, optimizer, scheduler, train_loader)
        score_val = eval_epoch(config, epoch, model, valid_loader, "Valid")
        score_test = eval_epoch(config, epoch, model, test_loader, "Test")

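        # save only when the test score improves on the best seen so far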
        if best_test is None or best_test < score_test:
            model.save(epoch, score_loss, score_val, score_test, save_file)
            best_epoch, best_loss, best_val, best_test = epoch, score_loss, score_val, score_test
            print(f">>>>>>> model saved at {save_file} {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")
        else:
            print(f">>>>>>> model not seved under accuracy {best_epoch} {best_loss:.3f} {best_val:.3f} {best_test:.3f}")
Example #5
def demp_pretrain(vocab_file, file):
    in_file = f"../data/corpus.book.middle.txt"

    vocab = global_data.load_vocab(vocab_file)

    features = _create_data(vocab=vocab,
                            filename=in_file,
                            seq_len=256,
                            reuse_len=128,
                            num_predict=43,
                            mask_alpha=6,
                            mask_beta=1,
                            perm_size=128)

    with open(file, 'wb') as f:
        pickle.dump(features, f)
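
# Reading the pickled features back (the path below is a placeholder):
import pickle

with open("pretrain_features.pkl", "rb") as f:
    features = pickle.load(f)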
Example #6
def demp_pretrain(vocab_file, file):
    in_file = f"../data/corpus.book.large.txt"

    vocab = global_data.load_vocab(vocab_file)

    token_ids = []
    with open(in_file) as f:
        for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
            line = line.strip()
            if line:
                token_ids.extend(vocab.encode_as_ids(line.lower()))
    token_ids = np.array(token_ids)

    with open(file, 'wb') as f:
        pickle.dump(token_ids, f)
Example #7
def demp_pretrain(vocab_file, file):
    args = cfg.Config({
        "reduce_memory": False,
        "train_corpus": Path("../data/corpus.book.large.txt"),
        "output_dir": Path("data"),
        "max_seq_len": 256,
        "short_seq_prob": 0.1,
        "masked_lm_prob": 0.15,
        "max_predictions_per_seq": 20,
        "do_whole_word_mask": True,
        "save_filename": file,
    })

    vocab = global_data.load_vocab(vocab_file)

    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = vocab.encode_as_pieces(line.lower())
                    doc.append(tokens)
            if doc:
                # If the last doc didn't end on a newline, make sure it still gets added
                docs.add_document(doc)
        if len(docs) <= 1:
            exit(
                "ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                "sections or paragraphs.")

    vocab_list = []
    for piece_id in range(vocab.get_piece_size()):
        if not vocab.is_unknown(piece_id):
            vocab_list.append(vocab.id_to_piece(piece_id))
    create_training_file(docs, vocab_list, args)

# Tail of a separate, XLNet-style feature-building helper; its opening lines
# are missing, so the enclosing conditional is reconstructed below and the
# padding of target / target_mask is only summarized.
    if num_predict is not None:
        # ... (truncated: target and target_mask are padded with
        #      torch.zeros([pad_len]) up to num_predict positions) ...
        feature["target_mask"] = torch.reshape(target_mask, [num_predict])
    else:
        feature["target"] = torch.reshape(target, [seq_len])
        feature["target_mask"] = torch.reshape(target_mask, [seq_len])

    # reshape back to fixed shape
    # (seq,)
    feature["seg_id"] = torch.IntTensor(feature["seg_id"])
    # (seq, seq)
    feature["perm_mask"] = torch.reshape(perm_mask, [seq_len, seq_len])
    # (seq,)
    feature["input_k"] = torch.reshape(input_k, [seq_len])
    # (seq,)
    feature["input_q"] = torch.reshape(input_q, [seq_len])

    return feature


if __name__ == '__main__':
    vocab = global_data.load_vocab("../data/m_snli_8000.model")
    _create_data(sp=vocab,
                 input_paths="data.txt",
                 seq_len=512,
                 reuse_len=256,
                 bi_data=False,
                 num_predict=85,
                 mask_alpha=6,
                 mask_beta=1)