Example #1
def main():
    # get and process data
    data = utils.MRPCData("./MRPC")
    print("num word: ", data.num_word)
    model = BERT(
        model_dim=MODEL_DIM, max_len=data.max_len, n_layer=N_LAYER, n_head=4, n_vocab=data.num_word,
        lr=LEARNING_RATE, max_seg=data.num_seg, drop_rate=0.1, padding_idx=data.v2i["<PAD>"])
    t0 = time.time()
    arange = np.arange(0, data.max_len)
    for t in range(10000):
        seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(data, arange)
        loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
        if t % 20 == 0:
            pred = pred[0].numpy().argmax(axis=1)
            t1 = time.time()
            print(
                "\n\nstep: ", t,
                "| time: %.2f" % (t1 - t0),
                "| loss: %.3f" % loss.numpy(),
                "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0][:xlen[0].sum()+1]]),
                "\n| prd: ", " ".join([data.i2v[i] for i in pred[:xlen[0].sum()+1]]),
                "\n| tgt word: ", [data.i2v[i] for i in seqs_[0]*loss_mask[0] if i != data.v2i["<PAD>"]],
                "\n| prd word: ", [data.i2v[i] for i in pred*loss_mask[0] if i != data.v2i["<PAD>"]],
                )
            t0 = t1
    os.makedirs("./visual_helper/bert", exist_ok=True)
    model.save_weights("./visual_helper/bert/model.ckpt")
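
The helper random_mask_or_replace used above is not shown in the example. Below is a minimal sketch, assuming the usual BERT masked-language-model recipe (mask roughly 15% of non-pad positions: 80% with <MASK>, 10% with a random token, 10% left unchanged). The name mlm_mask_sketch and all of its arguments are illustrative, not the repository's actual API.

# A minimal sketch of a random_mask_or_replace-style helper; mask_id, pad_id
# and n_vocab are illustrative assumptions, not names from the example above.
import numpy as np

def mlm_mask_sketch(seqs, pad_id, mask_id, n_vocab, mask_rate=0.15, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    seqs = np.array(seqs)                 # [batch, max_len] token ids
    targets = seqs.copy()                 # the unmasked ids are the MLM targets
    loss_mask = np.zeros_like(seqs, dtype=bool)
    for i in range(len(seqs)):
        valid = np.where(seqs[i] != pad_id)[0]            # positions holding real tokens
        if len(valid) == 0:
            continue
        n_pick = max(1, int(len(valid) * mask_rate))
        picked = rng.choice(valid, size=n_pick, replace=False)
        loss_mask[i, picked] = True
        for p in picked:
            r = rng.random()
            if r < 0.8:
                seqs[i, p] = mask_id                        # 80%: replace with <MASK>
            elif r < 0.9:
                seqs[i, p] = int(rng.integers(0, n_vocab))  # 10%: replace with a random token
            # else: keep the original token (10%)
    return seqs, targets, loss_mask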
Example #2
def train():
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    dataset = utils.MRPCData("./MRPC", 2000)
    print("num word: ", dataset.num_word)
    model = GPT(model_dim=MODEL_DIM,
                max_len=dataset.max_len - 1,
                num_layer=N_LAYER,
                num_head=4,
                n_vocab=dataset.num_word,
                lr=LEARNING_RATE,
                max_seg=dataset.num_seg,
                drop_rate=0.2,
                padding_idx=dataset.pad_id)
    if torch.cuda.is_available():
        print("GPU train avaliable")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()

    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    for epoch in range(100):
        for batch_idx, batch in enumerate(loader):
            seqs, segs, xlen, nsp_labels = batch
            seqs = seqs.type(torch.LongTensor).to(device)
            segs = segs.type(torch.LongTensor).to(device)
            nsp_labels = nsp_labels.to(device)
            # pred: [n, step, n_vocab]
            loss, pred = model.step(seqs=seqs[:, :-1],
                                    segs=segs[:, :-1],
                                    seqs_=seqs[:, 1:],
                                    nsp_labels=nsp_labels)
            if batch_idx % 100 == 0:
                pred = pred[0].cpu().data.numpy().argmax(axis=1)  # [step]
                print(
                    "Epoch: ", epoch,
                    "|batch: ", batch_idx,
                    "| loss: %.3f" % loss,
                    "\n| tgt: ", " ".join(
                        [dataset.i2v[i] for i in seqs[0, 1:].cpu().data.numpy()[:xlen[0].sum() + 1]]),
                    "\n| prd: ", " ".join(
                        [dataset.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                )
    os.makedirs("./visual/models/gpt", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/gpt/model.pth")
    export_attention(model, device, dataset)
Example #3
def export_attention():
    data = utils.MRPCData("./MRPC")
    print("num word: ", data.num_word)
    model = BERT(
        model_dim=MODEL_DIM, max_len=data.max_len, n_layer=N_LAYER, n_head=4, n_vocab=data.num_word,
        lr=LEARNING_RATE, max_seg=data.num_seg, drop_rate=0.1, padding_idx=data.v2i["<PAD>"])
    model.load_weights("./visual_helper/bert/model.ckpt")

    # save attention matrix for visualization
    seqs, segs, xlen, nsp_labels = data.sample(1)
    model(seqs, segs, False)
    data = {"src": [data.i2v[i] for i in seqs[0]], "attentions": model.attentions}
    with open("./visual_helper/bert_attention_matrix.pkl", "wb") as f:
        pickle.dump(data, f)
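
A hedged sketch of reading the exported pickle back for visualization. The "src" and "attentions" keys match what the example writes; the exact shape of model.attentions (assumed here to be one [n_head, len, len] array per layer) is an assumption and may need adjusting to the real model.

import pickle
import numpy as np
import matplotlib.pyplot as plt

with open("./visual_helper/bert_attention_matrix.pkl", "rb") as f:
    dumped = pickle.load(f)

tokens = dumped["src"]                      # tokens of the sampled sentence
attn = np.asarray(dumped["attentions"][0])  # assumed: attention of the first layer
head0 = attn[0]                             # assumed: [len, len] weights of the first head

plt.imshow(head0, cmap="viridis")
plt.xticks(range(len(tokens)), tokens, rotation=90)
plt.yticks(range(len(tokens)), tokens)
plt.title("BERT attention (layer 0, head 0)")
plt.tight_layout()
plt.show()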
Example #4
def export_attention(model, data, name="gpt"):  # header inferred from the __main__ call below
    # save attention matrix for visualization
    seqs, segs, xlen, nsp_labels = data.sample(32)
    model.call(seqs[:, :-1], segs[:, :-1], False)
    data = {
        "src": [[data.i2v[i] for i in seqs[j]] for j in range(len(seqs))],
        "attentions": model.attentions
    }
    with open("./visual/tmp/%s_attention_matrix.pkl" % name, "wb") as f:
        pickle.dump(data, f)


if __name__ == "__main__":
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)
    m = GPT(model_dim=MODEL_DIM,
            max_len=d.max_len - 1,
            n_layer=N_LAYER,
            n_head=4,
            n_vocab=d.num_word,
            lr=LEARNING_RATE,
            max_seg=d.num_seg,
            drop_rate=0.2,
            padding_idx=d.pad_id)
    train(m, d, step=5000, name="gpt")
    export_attention(m, d, name="gpt")
Example #5
def train():
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    dataset = utils.MRPCData("./MRPC", 2000)
    print("num word: ", dataset.num_word)
    model = BERT(model_dim=MODEL_DIM,
                 max_len=dataset.max_len,
                 num_layer=N_LAYER,
                 num_head=4,
                 n_vocab=dataset.num_word,
                 lr=LEARNING_RATE,
                 max_seg=dataset.num_seg,
                 drop_rate=0.2,
                 padding_idx=dataset.pad_id)
    if torch.cuda.is_available():
        print("GPU train avaliable")
        device = torch.device("cuda")
        model = model.cuda()
    else:
        device = torch.device("cpu")
        model = model.cpu()

    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    arange = np.arange(0, dataset.max_len)
    for epoch in range(500):
        for batch_idx, batch in enumerate(loader):
            seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(
                batch, arange, dataset)
            seqs = seqs.type(torch.LongTensor).to(device)
            segs = segs.type(torch.LongTensor).to(device)
            seqs_ = seqs_.type(torch.LongTensor).to(device)
            nsp_labels = nsp_labels.to(device)
            loss_mask = loss_mask.to(device)
            loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
            if batch_idx % 100 == 0:
                pred = pred[0].cpu().data.numpy().argmax(axis=1)
                print(
                    "\n\nEpoch: ", epoch,
                    "|batch: ", batch_idx,
                    "| loss: %.3f" % loss,
                    "\n| tgt: ", " ".join(
                        [dataset.i2v[i] for i in seqs[0].cpu().data.numpy()[:xlen[0].sum() + 1]]),
                    "\n| prd: ", " ".join(
                        [dataset.i2v[i] for i in pred[:xlen[0].sum() + 1]]),
                    "\n| tgt word: ", [
                        dataset.i2v[i]
                        for i in (seqs_[0] * loss_mask[0].view(-1)).cpu().data.numpy()
                        if i != dataset.v2i["<PAD>"]],
                    "\n| prd word: ", [
                        dataset.i2v[i]
                        for i in pred * (loss_mask[0].view(-1).cpu().data.numpy())
                        if i != dataset.v2i["<PAD>"]],
                )
    os.makedirs("./visual/models/bert", exist_ok=True)
    torch.save(model.state_dict(), "./visual/models/bert/model.pth")
    export_attention(model, device, dataset)
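
A minimal sketch (an assumption, not part of the example) of restoring the checkpoint saved above for inspection. It reuses the constructor arguments from train() and assumes BERT and utils are importable exactly as in the example.

import torch
import utils  # the same utils module the examples rely on

dataset = utils.MRPCData("./MRPC", 2000)
model = BERT(model_dim=256,          # BERT class as used in the example above
             max_len=dataset.max_len,
             num_layer=4,
             num_head=4,
             n_vocab=dataset.num_word,
             lr=1e-4,
             max_seg=dataset.num_seg,
             drop_rate=0.2,
             padding_idx=dataset.pad_id)
model.load_state_dict(torch.load("./visual/models/bert/model.pth", map_location="cpu"))
model.eval()  # disable dropout before probing the trained model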