Example #1
File: train.py Project: lionsterben/NLP
def get_prediction(source_path, target_path, model):
    # greedy decoding: take the argmax token at each step (not beam search)
    preds = []
    targets = []
    batch_size = 32
    max_len = 50
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        source_path, base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        target_path, base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    for start_index in range(0, len(source_lines), batch_size):
        source_input, _, source_input_token, _ = generate_batch(
            source_lines, source_word2id, source_id2word, batch_size,
            start_index, True, True, max_len)
        # only the reference tokens are needed on the target side
        _, _, _, target_output_token = generate_batch(
            target_lines, target_word2id, target_id2word, batch_size,
            start_index, True, True, max_len)
        source_input = torch.tensor(source_input).to(device)
        # the final batch may be smaller than batch_size
        cur_batch_size = source_input.size(0)
        source_input_mask = compute_mask(source_input)
        batch_preds = [[] for _ in range(cur_batch_size)]
        # mask[idx] flips to 1 once sequence idx has produced <eos>
        mask = [0 for _ in range(cur_batch_size)]
        encode_state = model.encode(source_input, source_input_mask)
        # start every sequence from the <sos> token
        decode_input = torch.full((cur_batch_size, 1), SOS_ID,
                                  dtype=torch.long, device=device)
        for _ in range(max_len):
            # output_logits: (cur_batch_size, 1, vocab_size)
            output_logits, decoder_state = model.decode(
                decode_input, encode_state)
            output_logits = output_logits.squeeze(1)
            output_id = torch.argmax(output_logits, 1).tolist()
            for idx in range(cur_batch_size):
                if mask[idx]:
                    continue
                token_id = output_id[idx]
                if token_id == EOS_ID:
                    mask[idx] = 1
                else:
                    batch_preds[idx].append(target_id2word[str(token_id)])
            if sum(mask) == len(mask):
                break  # every sequence has emitted <eos>
            # feed the predicted tokens back in as the next decoder input
            decode_input = torch.tensor(output_id,
                                        device=device).view(cur_batch_size, 1)
            # carry the decoder state into the next step
            encode_state = decoder_state
        preds.extend(batch_preds)
        targets.extend(target_output_token)
    # spot-check the first two predictions against their references
    print(preds[:2])
    print(targets[:2])
    return preds, targets
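
A minimal sketch of how this helper might be driven once a model has been trained. The call below reuses the dev-split paths from train(); the NLTK corpus_bleu call is an illustrative assumption standing in for whatever metric the project actually uses.

# Hypothetical usage of get_prediction (assumption: a trained `model` is in
# scope); corpus_bleu is a stand-in metric, not part of the project.
from nltk.translate.bleu_score import corpus_bleu

base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
model.eval()           # greedy decoding should run in eval mode
with torch.no_grad():  # no gradients are needed at inference time
    preds, targets = get_prediction(base_dir + "frdev_lines",
                                    base_dir + "endev_lines", model)
# corpus_bleu expects one list of reference token lists per hypothesis
print(corpus_bleu([[t] for t in targets], preds))
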
Example #2
File: train.py Project: lionsterben/NLP
def train(opt):
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    d_model = opt["d_model"]
    key_hidden_size = opt["key_hidden_size"]
    value_hidden_size = opt["value_hidden_size"]
    heads = opt["heads"]
    ff_size = opt["ff_size"]
    drop_rate = opt["drop_rate"]
    epoch = opt["epoch"]
    lr_rate = opt["lr_rate"]
    model = Transformer(source_vocab_size, target_vocab_size, d_model,
                        key_hidden_size, value_hidden_size, heads, ff_size,
                        drop_rate).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # materialize the trainable parameters as a list: a filter object is an
    # iterator that the optimizer would exhaust, leaving nothing for
    # clip_grad_norm_ below
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=lr_rate)
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    batch_size = 64
    max_len = 50
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 5)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # average training loss over the last print_every steps
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(
                        save_dir + str(cnt) + "origin_transformer_preds.json",
                        "w") as f:
                    json.dump(preds, f)
                with open(
                        save_dir + str(cnt) +
                        "origin_transformer_targets.json", "w") as f:
                    json.dump(targets, f)
                model.train()
        print(ep)  # progress: epochs completed
        # halve the learning rate after every epoch
        lr_rate = lr_rate * 0.5
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_rate
Example #3
File: train.py Project: lionsterben/NLP
def train(opt):
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    source_hidden_size = opt["source_hidden_size"]
    target_hidden_size = opt["target_hidden_size"]
    source_emb_size = opt["source_emb_size"]
    target_emb_size = opt["target_emb_size"]
    batch_size = opt["batch_size"]
    max_len = opt["max_len"]
    model = Basic_NMT(source_vocab_size, target_vocab_size, source_emb_size,
                      target_emb_size, source_hidden_size,
                      target_hidden_size).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # use a list, not a filter iterator, so the parameters survive both the
    # optimizer construction and the clip_grad_norm_ call below
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=opt["learning_rate"])
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            source_input_mask = compute_mask(source_input)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, source_input_mask,
                                   target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 10)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # average training loss over the last print_every steps
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(save_dir + str(cnt) + "preds.json", "w") as f:
                    json.dump(preds, f)
                with open(save_dir + str(cnt) + "targets.json", "w") as f:
                    json.dump(targets, f)
                model.train()
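
As in the Transformer example, this RNN variant pulls its configuration from opt, here also including batch_size, max_len, and learning_rate. A sketch with assumed values:

# Hypothetical configuration for the Basic_NMT train(); values are assumptions.
opt = {
    "source_vocab_size": 30000,
    "target_vocab_size": 30000,
    "source_emb_size": 300,
    "target_emb_size": 300,
    "source_hidden_size": 256,
    "target_hidden_size": 256,
    "batch_size": 64,
    "max_len": 50,
    "learning_rate": 1e-3,
    "epoch": 10,
}
train(opt)
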
Example #4
File: train.py Project: lionsterben/NLP
def train(opt):
    source_vocab_size = opt["source_vocab_size"]
    target_vocab_size = opt["target_vocab_size"]
    source_hidden_size = opt["source_hidden_size"]
    target_hidden_size = opt["target_hidden_size"]
    source_emb_size = opt["source_emb_size"]
    target_emb_size = opt["target_emb_size"]
    batch_size = opt["batch_size"]
    max_len = opt["max_len"]
    lr_rate = opt["learning_rate"]
    model = Attention_NMT(source_vocab_size, target_vocab_size,
                          source_emb_size, target_emb_size,
                          source_hidden_size, target_hidden_size).to(device)
    base_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/en-fr-no/"
    source_lines, source_word2id, source_id2word = get_origin_data(
        base_dir + "frtrain_lines", base_dir + "fr_word2id.json",
        base_dir + "fr_id2word.json")
    target_lines, target_word2id, target_id2word = get_origin_data(
        base_dir + "entrain_lines", base_dir + "en_word2id.json",
        base_dir + "en_id2word.json")
    # use a list, not a filter iterator, so the parameters survive both the
    # optimizer construction and the clip_grad_norm_ call below
    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(parameters, lr=lr_rate)
    print_every = 1000
    cnt = 0
    total = 0
    save_dir = "/home/FuDawei/NLP/Machine_Translation/dataset/save_res/"
    for ep in range(opt["epoch"]):
        for start_index in range(0, len(source_lines), batch_size):
            source_input, _, source_input_token, _ = generate_batch(
                source_lines, source_word2id, source_id2word, batch_size,
                start_index, False, True, max_len)
            target_input, target_output, target_input_token, target_output_token = generate_batch(
                target_lines, target_word2id, target_id2word, batch_size,
                start_index, True, True, max_len)
            source_input = torch.tensor(source_input).to(device)
            source_input_mask = compute_mask(source_input)
            target_input, target_output = torch.tensor(target_input).to(
                device), torch.tensor(target_output).to(device)
            decoder_logits = model(source_input, source_input_mask,
                                   target_input)
            loss = model.compute_loss(decoder_logits, target_output)
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, 10)
            optimizer.step()
            cnt += 1
            total += loss.item()
            if cnt % print_every == 0:
                # average training loss over the last print_every steps
                print(total / print_every)
                total = 0
            if cnt % 10000 == 0:
                model.eval()
                preds, targets = get_prediction(base_dir + "frdev_lines",
                                                base_dir + "endev_lines",
                                                model)
                with open(save_dir + str(cnt) + "attention_preds.json",
                          "w") as f:
                    json.dump(preds, f)
                with open(save_dir + str(cnt) + "attention_targets.json",
                          "w") as f:
                    json.dump(targets, f)
                model.train()
        # halve the learning rate every other epoch
        if ep > 0 and ep % 2 == 0:
            lr_rate *= 0.5
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr_rate