Example #1
def test_output_file(save_dir='./output_file', save_filename='myTest.txt'):
    SAVE_PATH = save_dir
    FILE_NAME = save_filename
    DATA_PATH = './data/dataset_eval'
    DICT_PATH = './model/save_/vocab.pkl'
    MODEL_PATH = './checkpoint/save_baseline/base_seq2seq500.ckpt'
    EMBEDDING_PATH = './model/save_/embedding_8k.ckpt'
    GPU_NUM = 1

    if not os.path.exists(SAVE_PATH):
        os.mkdir(SAVE_PATH)

    torch.cuda.set_device(GPU_NUM)

    # vocabulary object
    vocab = pickle.load(open(DICT_PATH, 'rb'))
    config = Config()

    testset = dataset.CompresDataset(vocab=vocab, data_path=DATA_PATH)
    testloader_word = DataLoader(dataset=testset,
                                 collate_fn=my_fn,
                                 batch_size=1,
                                 pin_memory=True)

    model = Seq2Seq(config).cuda()
    model.load(MODEL_PATH)

    embed = nn.Embedding(num_embeddings=8004, embedding_dim=300)
    embed.load_state_dict(torch.load(EMBEDDING_PATH))
    embed.cuda()

    path = os.path.join(SAVE_PATH, FILE_NAME)
    save_file = open(path, 'w')

    pair_num = len(testloader_word)
    print('The number of sentence pairs is: {}'.format(pair_num))

    for index, (src, trg, labels) in enumerate(testloader_word, 1):
        in_word = src.cuda()
        src = embed(src.cuda())
        trg = embed(trg.cuda())
        labels = labels.cuda()

        out = model(src, trg)

        out = out.view(-1, 2)
        labels = labels.view(-1)

        mask_matrix = labels.ge(0)
        ground_truth = torch.masked_select(labels, mask_matrix)
        predict_labels = torch.masked_select(torch.max(out, 1)[1],
                                             mask_matrix)

        # boolean masks (byte/uint8 masks are deprecated in newer PyTorch)
        output_list = torch.masked_select(in_word, predict_labels.bool()).tolist()
        text = torch.masked_select(in_word, ground_truth.bool()).tolist()

        sentence_list = vocab.index_list_to_sentence(in_word.squeeze().tolist())
        out_word_list = vocab.index_list_to_sentence(output_list)
        text_list = vocab.index_list_to_sentence(text)

        sentence = ' '.join(sentence_list)
        text = ' '.join(text_list)
        output_word = ' '.join(out_word_list)

        print(index, '/', pair_num)

        save_file.write('{0}\n{1}\n{2}\n{3}\n\n'.format(index, sentence, text, output_word))

    save_file.close()
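
A minimal usage sketch for the function above. The directory and file name simply restate the defaults; the data, vocabulary, and checkpoint paths hard-coded inside test_output_file() are assumed to exist on disk.

# Illustrative call; paths inside test_output_file() are assumed to be present.
if __name__ == '__main__':
    test_output_file(save_dir='./output_file', save_filename='myTest.txt')
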
Example #2
def train_3layers(reload_dataset=False, epoch_num=50, pretrain_model_path=None, optim_fu='adam', visdom_env='3 layers LSTM'):

    # configuration
    DATA_DIR = './data/train_pairs'
    DICT_PATH = './checkpoint/dict_20000.pkl'
    EMBEDDING_PATH_RANDOM = './model/save_embedding_97and3.ckpt'
    SAVE_EMBEDDING = False
    RELOAD_DATASET = reload_dataset

    SAVE_DATASET_OBJ = './data/dataset.pkl'
    SAVE_MODEL_PATH = './checkpoint/LSTM3Layers/'
    VISDOM_ENV = visdom_env
    PRINT_STEP = 5
    SAVE_STEP = 1
    GPU_NUM = 1

    torch.manual_seed(2)
    torch.cuda.set_device(GPU_NUM)

    config = LSTM3LayersConfig()
    model = LSTM3Layers(config)

    if pretrain_model_path is not None:
        model.load(pretrain_model_path)
    model.cuda()

    vis = visdom.Visdom(env=VISDOM_ENV)
    if not os.path.exists(SAVE_MODEL_PATH):
        os.makedirs(SAVE_MODEL_PATH)

    # word embeddings
    embed = nn.Embedding(num_embeddings=20000, embedding_dim=97)
    if SAVE_EMBEDDING:
        torch.save(embed.state_dict(), EMBEDDING_PATH_RANDOM)
    else:
        embed.load_state_dict(torch.load(EMBEDDING_PATH_RANDOM))
    embed = embed.cuda()

    # flag embedding for the decoder input
    embed_labels = get_flag_embed().cuda()

    criterion = nn.CrossEntropyLoss(ignore_index=2)    # ignore padding index
    optimizer = get_optimizer(optim_fu, model, config.lr)

    # vocabulary object
    vocab = pickle.load(open(DICT_PATH, 'rb'))

    if RELOAD_DATASET:
        data = dataset.CompresDataset(vocab=vocab, data_path=DATA_DIR)
        with open(SAVE_DATASET_OBJ, 'wb') as f:
            pickle.dump(data, f)
    else:
        data = pickle.load(open(SAVE_DATASET_OBJ, 'rb'))

    print('The length of the data is: {}'.format(len(data)))

    trainloader = DataLoader(dataset=data,
                             collate_fn=my_fn,
                             batch_size=config.batch_size,
                             pin_memory=torch.cuda.is_available(),
                             shuffle=True)

    vis.text('Running the seq2seq at {}'.format(time.strftime('%x %X')), win='log')

    episode = 0
    loss_sum = 0
    axis_index = 0
    correct_num = 0
    batch_num = 0
    recall_correct = 0
    recall_all = 0
    C_rate_remain = 0
    C_rate_all = 0
    save_index = 0

    for epoch in range(epoch_num):
        for src, trg, labels in trainloader:

            src = embed(src.cuda())
            trg = embed(trg.cuda())

            flag4encoder = torch.zeros(src.shape[0], src.shape[1], 3).cuda()
            src = torch.cat([src, flag4encoder], dim=2)

            flag4decoder = torch.zeros([labels.shape[0], 1]).long()
            flag4decoder = torch.cat([flag4decoder, labels[:, :-1]], dim=1).cuda()
            flag4decoder = embed_labels(flag4decoder)

            trg = torch.cat([trg, flag4decoder], dim=2)
            labels = labels.cuda()

            out, _ = model(src, trg)
            out = out.view(-1, 2)
            labels = labels.view(-1)
            loss = criterion(out, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            mask_matrix = (labels < 2)
            ground_truth = torch.masked_select(labels, mask_matrix)
            predict_labels = torch.masked_select(torch.max(out, 1)[1],
                                                 mask_matrix)

            print('g', ground_truth)
            print('p', predict_labels)

            C_rate_all += len(predict_labels)
            C_rate_remain += torch.sum(predict_labels).item()

            correct_num += torch.sum(predict_labels == ground_truth).item()
            batch_num += len(ground_truth)
            # recall statistics during training
            recall_correct += torch.sum(ground_truth & predict_labels).item()
            recall_all += torch.sum(ground_truth).item()
            # accumulate the loss
            loss_sum += loss.item()

            if episode % PRINT_STEP == 0 and episode != 0:
                # calculate the precision P, recall R, and F1
                P = correct_num / batch_num
                R = recall_correct / recall_all
                F1 = 2 * P * R / (P + R)

                # calculate for the compression rate.
                C_rate = C_rate_remain / C_rate_all

                draw_line(vis, axis_index, (P, R, F1), names=('Precision', 'Recall', 'F1 score'))
                draw_line(vis, axis_index, (loss_sum/PRINT_STEP,), names=('Step Loss',))
                draw_line(vis, axis_index, (1-C_rate,), names=('Compression Rate',))

                # update
                correct_num = 0
                batch_num = 0
                recall_correct = 0
                recall_all = 0
                axis_index += 1
                loss_sum = 0
                C_rate_all = 0
                C_rate_remain = 0

                # save model
                if axis_index % SAVE_STEP == 0:
                    model.save(SAVE_MODEL_PATH + 'model{}.ckpt'.format(epoch))
                    save_index += 1
            episode += 1
        vis.text(time.strftime('%x %X') + ' finished epoch {}'.format(epoch), win='log', append=True)
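
A hedged invocation example for train_3layers. The argument values mostly restate the defaults; a running visdom server and the data and checkpoint paths hard-coded inside the function are assumed, and reload_dataset=True rebuilds and pickles the dataset object before training.

# Illustrative call; assumes a running visdom server and the hard-coded paths.
if __name__ == '__main__':
    train_3layers(reload_dataset=True,
                  epoch_num=50,
                  pretrain_model_path=None,
                  optim_fu='adam',
                  visdom_env='3 layers LSTM')
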
Example #3
    EMBED_PATH = './model/save_embedding_97and3.ckpt'
    SAVE_PATH = './test_out'
    SAVE_FILE = 'demo.txt'
    SAVE_DIR = os.path.join(SAVE_PATH, SAVE_FILE)

    # MODEL_PATH = './checkpoint/normal/transformers_epoch90.ckpt'
    MODEL_PATH = './checkpoint/Transformer_lr0.0003_b200_head10_layer2_ff100_no_pos/transformers_epoch90.ckpt'
    # MODEL_PATH = './checkpoint/transformers_epoch90.ckpt'

    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    vocab = pickle.load(open(DICT_PATH, 'rb'))

    data = dataset.CompresDataset(vocab=vocab,
                                  data_path=TEST_DIR,
                                  reverse_src=False)
    testloader = DataLoader(
        dataset=data,
        collate_fn=my_fn,
        batch_size=1000 if torch.cuda.is_available() else 2,
        # batch_size=2,
        pin_memory=torch.cuda.is_available(),
        shuffle=True)

    model = BasicTransformer(d_model=100,
                             nhead=10,
                             num_encoder_layer=2,
                             num_decoder_layer=2,
                             dim_feedforward=100)
Example #4
def test_3layerLSTM_Res(model_num=1):
    # DATA_PATH = './data/dataset_eval'
    DATA_PATH = './nbc'

    DICT_PATH = './checkpoint/dict_20000.pkl'
    # MODEL_PATH = './checkpoint/save_seq2seq1layer/base_seq2seq{}.ckpt'
    EMBEDDING_PATH_RANDOM = './model/save_embedding_97and3.ckpt'
    GPU_NUM = 0
    # MODEL_PATH = './checkpoint/save_seq2seq1layer_withFLAGS/base_seq2seq_epoch{}.ckpt'
    MODEL_PATH = './checkpoint/LSTM3LayersRes/model{}.ckpt'
    BATCH_SIZE = 100

    torch.cuda.set_device(GPU_NUM)

    # vocabulary object
    vocab = pickle.load(open(DICT_PATH, 'rb'))

    testset = dataset.CompresDataset(vocab=vocab, data_path=DATA_PATH)
    testloader = DataLoader(dataset=testset,
                            collate_fn=my_fn,
                            batch_size=BATCH_SIZE,
                            pin_memory=True)

    config = LSTM3LayersResConfig()
    model = LSTM3LayersRes(config).cuda()

    model.load(MODEL_PATH.format(model_num))

    # load the pre-trained word embeddings
    embed = nn.Embedding(num_embeddings=20000, embedding_dim=97)
    embed.load_state_dict(torch.load(EMBEDDING_PATH_RANDOM))
    embed.cuda()

    embed_flag = get_flag_embed().cuda()

    correct_num = 0
    all_num = 0

    recall_correct = 0
    recall_all = 0

    model.eval()
    for i, (src, trg, labels) in enumerate(testloader):
        print(i*src.shape[0])
        src = embed(src.cuda())
        trg = trg.cuda()
        labels = labels.cuda()
        # append a three-dimensional all-zero flag vector to the encoder input
        flag4encoder = torch.zeros(src.shape[0], src.shape[1], 3).cuda()
        src = torch.cat([src, flag4encoder], dim=2)

        _, hidden = model.step_encoding(src)    # get the encoder hidden state
        # hidden = tuple([state.view(config.num_layers, -1, config.hidden_size) for state in hidden])

        output_labels = []
        input_flag = [[2] for j in range(trg.shape[0])]
        input_flag = torch.Tensor(input_flag).long().cuda()
        for index in range(trg.shape[1]):
            # Prepare for the input
            flag4encoder = embed_flag(input_flag)
            select_elem = torch.index_select(trg, 1, torch.tensor(index).cuda())
            decoder_input = embed(select_elem)
            decoder_input = torch.cat([decoder_input, flag4encoder], dim=2)

            out, hidden = model.step_decoding(decoder_input, hidden)
            input_flag = torch.max(out, 2)[1]
            output_labels.append(input_flag)

        output_labels = torch.cat(output_labels, dim=1)
        labels = labels.squeeze()
        mask_matrix = labels < 2
        predict_labels = torch.masked_select(output_labels, mask_matrix)
        ground_truth = torch.masked_select(labels, mask_matrix)

        correct_num += torch.sum((predict_labels == ground_truth).long()).item()
        recall_correct += torch.sum((predict_labels & ground_truth).long()).item()
        recall_all += torch.sum(ground_truth).item()
        all_num += len(ground_truth)

        P = correct_num / all_num
        R = recall_correct / recall_all
        F1 = 2 * P * R / (P + R)
        print('Precision is {}'.format(P))
        print('Recall is {}'.format(R))
        print('F1 is {} \n'.format(F1))

    P = correct_num / all_num
    R = recall_correct / recall_all
    F1 = 2 * P * R / (P + R)
    print('Finally')
    print('\tPrecision is {}'.format(P))
    print('\tRecall is {}'.format(R))
    print('\tF1 is {}'.format(F1))
    return P, R, F1
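
A short usage sketch for the evaluation function above. The checkpoint numbers are hypothetical and must correspond to files saved under ./checkpoint/LSTM3LayersRes/.

# Hypothetical checkpoint sweep; the model numbers are assumptions.
if __name__ == '__main__':
    for num in (1, 5, 10):
        P, R, F1 = test_3layerLSTM_Res(model_num=num)
        print('model{}: P={:.4f}, R={:.4f}, F1={:.4f}'.format(num, P, R, F1))
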
Example #5
    embed = embed.cuda()
    embed_labels = embed_labels.cuda()
    model.cuda()
    torch.cuda.set_device(GPU_NUM)
    print("CUDA available")

else:
    print("CUDA unavailable")

# Training Config
criterion = nn.CrossEntropyLoss(ignore_index=2)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

# Training dataset
vocab = pickle.load(open(DICT_PATH, 'rb'))
data = dataset.CompresDataset(vocab=vocab, data_path=DATA_DIR)
print("the number of the training data is: {}".format(len(data)))
trainloader = DataLoader(dataset=data,
                         collate_fn=my_fn,
                         batch_size=BATCH_SIZE,
                         pin_memory=torch.cuda.is_available(),
                         shuffle=True)

# Testing dataset
testset = dataset.CompresDataset(vocab=vocab, data_path=TEST_DIR)
testloader = DataLoader(dataset=testset,
                        collate_fn=my_fn,
                        batch_size=BATCH_SIZE,
                        pin_memory=True,
                        shuffle=False)
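
The fragment above ends after the loaders are built. Below is a minimal sketch of one training step using the objects it defines, assuming the model consumes embedded (src, trg) batches and returns per-token logits over two classes; the flag-vector concatenation shown in Example #2 is omitted here, and the exact model signature is an assumption patterned on that example.

# Minimal training-step sketch; model signature is an assumption based on Example #2.
for src, trg, labels in trainloader:
    src, trg, labels = embed(src.cuda()), embed(trg.cuda()), labels.cuda()
    out = model(src, trg)                      # assumed (batch, seq_len, 2) logits
    loss = criterion(out.view(-1, 2), labels.view(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()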