Example #1
def build_voca_test():
    config = get_config()
    train_json_path = config['caption_train_path']
    with open(train_json_path, "r") as f:
        train_data = json.load(f)
    voca_root_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\datasets\\caption\\"
    build_voca(train_data, 5, voca_root_path)
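build_voca itself is not shown in this example. Below is a minimal sketch of a frequency-threshold vocabulary builder, assuming a COCO-style dict of image filename to caption list and a hypothetical output layout; the real function may differ.

import pickle
from collections import Counter

def build_voca_sketch(train_data, min_count, save_root):
    # Count every word across all captions.
    counter = Counter()
    for captions in train_data.values():
        for caption in captions:
            counter.update(caption.lower().split())

    # Special tokens first so their indices stay fixed.
    idx2word = ['<pad>', '<start>', '<end>', '<unk>']
    idx2word += [word for word, count in counter.items() if count >= min_count]
    word2idx = {word: idx for idx, word in enumerate(idx2word)}

    # Persist the vocabulary next to the other caption artifacts.
    with open(save_root + 'vocab.pickle', 'wb') as f:
        pickle.dump({'idx2word': idx2word, 'word2idx': word2idx}, f)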
Example #2
def attenntion_training_test():
    config = get_config()
    print(config)
    AI_DIREC = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    vocab_path = AI_DIREC + config['caption_vocab_path']
    word2idx_path = AI_DIREC + config['word2idx_train_path']
    image_path = config['caption_train_image_path']
    save_path = AI_DIREC + config['checkpoints_saved_path']
    print(save_path)
    caption_path = AI_DIREC + config['caption_train_path']

    print(vocab_path)
    print(word2idx_path)
    mini_batch_loss, encoder, decoder = attention_caption_train(
        vocab_path,
        image_path,
        config,
        caption_path,
        word2idx_path=word2idx_path)

    datestr = date2str()
    save_config(config, "attention_config" + datestr, save_path)
    save_loss(mini_batch_loss, "attention_loss" + datestr, save_path)
    save_model(encoder, "attention_encoder" + datestr, save_path)
    save_model(decoder, "attention_decoder" + datestr, save_path)
Example #3
def caption_test_test():
    config = get_config()
    vocab_path = config['caption_vocab_path']
    image_path = config['caption_train_image_path']
    encoder_path = config['caption_encoder_path']
    decoder_path = config['caption_decoder_path']
    caption_path = config['caption_test_path']
    config_path = config['config_path']
    word2idx_path = config['word2idx_test_path']
    batch = 1
    max_sequence_len = 30

    #test(vocab_path,encoder_path,decoder_path,caption_path,image_path,config_path,batch,max_sequence_len,word2idx_path=None)
    images, result_captions, original_captions = caption_test(
        vocab_path,
        encoder_path,
        decoder_path,
        caption_path,
        image_path,
        config_path,
        batch,
        max_sequence_len,
        word2idx_path=word2idx_path)
    print(result_captions)
    visualize_img_caption(images, result_captions)
Example #4
def train_procedure_test():
    config = get_config()
    load_path = config['word2idx_train_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    voca = load_voca(voca_path)
    batch_size = 2
    embed_size = 10
    vocab_len = len(voca)
    hidden_layer = 1
    hidden_size = 10
    loader = make_caption_loader(dataset, batch_size,
                                 config['caption_train_image_path'])

    dataiter = iter(loader)
    images, caption, length = next(dataiter)

    # Check the data shapes
    print("Checking data shapes")
    print(images.size())
    print(caption.size())

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_len, hidden_layer, hidden_size)

    grad_params = list(encoder.linear.parameters())

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=grad_params, lr=0.001)

    compare_target = pack_padded_sequence(caption, length,
                                          batch_first=True).data

    feature = encoder(images)
    output = decoder(caption, feature, length)

    loss = loss_function(output, compare_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    mini_batch_loss = []
    mini_batch_loss.append(loss.item())
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)
    print(
        "Is optimizer.zero_grad() the same as encoder.zero_grad() and decoder.zero_grad()?"
    )
    print("optimizer.zero_grad() 호출하기 전")
    print(encoder.linear.weight.grad)
    print("optimizer.zero_grad() 호출한 후")
    optimizer.zero_grad()
    print(encoder.linear.weight.grad)
    print("====================")
    print(grad_params)
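The question printed at the end of this test can be answered directly: optimizer.zero_grad() only clears the gradients of the parameters registered with the optimizer (here just encoder.linear), whereas module.zero_grad() clears the gradients of every parameter of that module. A minimal, self-contained sketch with a hypothetical toy model (not the captioning networks above):

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
optimizer = optim.Adam(model[0].parameters(), lr=0.001)  # only the first layer is registered

loss = model(torch.randn(3, 4)).sum()
loss.backward()

optimizer.zero_grad()        # clears grads of model[0] only
print(model[0].weight.grad)  # zeroed (or None on newer PyTorch with set_to_none=True)
print(model[1].weight.grad)  # still holds gradients

model.zero_grad()            # clears grads of every parameter in the module
print(model[1].weight.grad)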
Example #5
def save_caption2idx_test():
    config = get_config()
    AI_DIREC = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    voca_path = AI_DIREC + config['caption_vocab_path']
    train_json_path = AI_DIREC + config['caption_train_path']
    voca = load_voca(voca_path)
    dataset = tokenized_data(train_json_path, voca)
    save_tokenized_data(dataset=dataset, AI_DIREC=AI_DIREC)
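tokenized_data is not shown here. Below is a minimal sketch of what it is assumed to produce, using the callable vocabulary object seen elsewhere in these examples; the 'image_list' key appears in Example #6, while 'word_vectors' is a hypothetical name.

import json

def tokenized_data_sketch(caption_json_path, voca):
    # Map every caption to a list of word indices framed by <start>/<end>.
    with open(caption_json_path, 'r') as f:
        data = json.load(f)
    image_list, word_vectors = [], []
    for image_name, captions in data.items():
        for caption in captions:
            indices = [voca('<start>')]
            indices += [voca(word) for word in caption.lower().split()]
            indices.append(voca('<end>'))
            image_list.append(image_name)
            word_vectors.append(indices)
    return {'image_list': image_list, 'word_vectors': word_vectors}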
Example #6
def loader_test():
    config = get_config()
    load_path = config['word2idx_test_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    print(dataset['image_list'])
    voca = load_voca(voca_path)

    loader = make_caption_loader(dataset, 10, config['train_image_path'])
    dataiter = iter(loader)
    images, padded_caption, caption_length = next(dataiter)
    print(images)
Example #7
def save_tokenized_data(dataset, AI_DIREC, save_path=None, type="train"):
    if type is "test":
        name = "wordVector_test"
    elif type is "val":
        name = "wordVector_val"
    else:
        name = "wordVector_train_5"
    name += ".pickle"
    if save_path is None:
        save_path = AI_DIREC + get_config()['caption_train_word_saved_path']
    save_path += name
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)
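load_tokenized_data, used in several tests above, is presumably the read counterpart of this function; a minimal sketch:

import pickle

def load_tokenized_data_sketch(load_path):
    # Read back the dataset pickled by save_tokenized_data.
    with open(load_path, 'rb') as f:
        return pickle.load(f)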
Example #8
def korean_gpt_long_setence_life_test():
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"

    kogpt2_vocab_path = config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)
    sent_cnt = 0

    input_ids = torch.tensor([
        vocab[vocab.bos_token],
    ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)

    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)

    target = outputs[0]
    print("========수필===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
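The hardcoded special-token ids in generate() (bos_token_id=0, eos_token_id=1, pad_token_id=3) can instead be looked up from the vocabulary, which avoids silent mismatches if the SentencePiece model changes. A small variation on the call above, reusing the vocab, input_ids, and kogpt2model objects already defined:

# Derive the special-token ids from the vocab instead of hardcoding them.
bos_id = vocab[vocab.bos_token]       # '<s>'
eos_id = vocab[vocab.eos_token]       # '</s>'
pad_id = vocab[vocab.padding_token]   # '<pad>'
outputs = kogpt2model.generate(input_ids=input_ids,
                               max_length=100,
                               min_length=50,
                               repetition_penalty=1.2,
                               do_sample=True,
                               num_beams=3,
                               bos_token_id=bos_id,
                               pad_token_id=pad_id,
                               eos_token_id=eos_id,
                               num_return_sequences=3)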
Example #9
def fine_tune_test():
    AI_DIRECTORY = "C:\\Users\\multicampus\\yye\\s02p23c104\\Back\\AI"
    #AI_DIRECTORY = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    config = get_config()
    fine_tune_num = 0
    new_kogpt_model, loss_record = fine_tuning(config, fine_tune_num,
                                               AI_DIRECTORY)
    epoch = config['kogpt_epoch']
    datestr = date2str()
    default_name = "kogpt_story_"
    model_name = default_name + "model_" + str(epoch) + "_" + datestr
    loss_name = default_name + "loss_" + datestr
    root_path = AI_DIRECTORY + config['checkpoints_saved_path']
    save_model(new_kogpt_model, model_name, root_path)
    save_loss(loss_record, loss_name, root_path)
Example #10
def build_korean_to_idx_test():
    config = get_config()
    #AI_DIRECTORY = "C:\\Users\\multicampus\\yye\\s02p23c104\\Back\\AI"
    AI_DIRECTORY = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_vocab_path = AI_DIRECTORY + config['kogpt_vocab_path']
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    file_path = AI_DIRECTORY + "\\datasets\\kogpt\\story_train_pkl.pkl"
    save_path = AI_DIRECTORY + "\\datasets\\kogpt\\"
    build_korean_to_idx(file_path, save_path, vocab, tok, block_size=256)
Example #11
def dataset_split_save(path):
    data = get_path_caption(path)
    data_keys = list(data.keys())

    total_size = len(data)
    cfg = get_config()
    src = cfg['image_path']
    dst_train = src+'train\\'
    # Split the total data 6:2:2 into train, validation, and function_test sets.
    print(src)
    print(dst_train)
    train_ratio = 0.6
    train_size = int(total_size*train_ratio)
    train_data_keys = data_keys[:train_size]
    train_data = dict()
    for image_path in train_data_keys:
        shutil.move(src+image_path,dst_train+image_path)
        train_data[image_path] = data[image_path]

    dst_val = src+'val\\'
    val_ratio = 0.2
    val_size = int(total_size*val_ratio)
    val_data_keys = data_keys[train_size:train_size+val_size]
    val_data = dict()
    for image_path in val_data_keys:
        shutil.move(src+image_path,dst_val+image_path)
        val_data[image_path] = data[image_path]

    dst_test = src+'function_test\\'
    test_data_keys = data_keys[train_size+val_size:]
    test_data = dict()
    for image_path in test_data_keys:
        shutil.move(src+image_path,dst_test+image_path)
        test_data[image_path] = data[image_path]

    with open("../datasets/train_data.json", 'w', encoding='utf-8') as train_json:
        json.dump(train_data, train_json)

    with open("../datasets/val_data.json", 'w', encoding='utf-8') as val_json:
        json.dump(val_data, val_json)

    with open("../datasets/test_data.json", 'w', encoding='utf-8') as test_json:
        json.dump(test_data, test_json)

    return train_data,val_data,test_data
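The 6:2:2 split above is purely positional: the dictionary keys are sliced by index, and each slice is moved to its own folder. A toy illustration of the slicing arithmetic:

data_keys = ['img_{}.jpg'.format(i) for i in range(10)]
total_size = len(data_keys)
train_size = int(total_size * 0.6)               # 6
val_size = int(total_size * 0.2)                 # 2
train_keys = data_keys[:train_size]
val_keys = data_keys[train_size:train_size + val_size]
test_keys = data_keys[train_size + val_size:]    # remaining 2
print(len(train_keys), len(val_keys), len(test_keys))  # 6 2 2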
Example #12
def caption_train_test():
    config = get_config()
    print(config)
    vocab_path = config['caption_vocab_path']
    word2idx_path = config['word2idx_train_path']
    image_path = config['caption_train_image_path']
    save_path = config['checkpoints_saved_path']
    caption_path = config['caption_train_path']

    mini_batch_loss, encoder, decoder = caption_train(
        vocab_path,
        image_path,
        config,
        caption_path,
        word2idx_path=word2idx_path)

    datestr = date2str()
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)
Example #13
def all_datasets_split(path):
    images, captions = get_imagepath_caption(path)
    images_size = len(images)
    cfg = get_config()
    src = cfg['origin']
    dst_train = src + 'train\\'
    train_ratio = 0.6
    train_size = int(images_size * train_ratio)
    train_data_images = images[:train_size]
    train_data = {}
    for idx, image in enumerate(train_data_images):
        shutil.move(src + image, dst_train + image)
        train_data[image] = captions[idx:idx+5]
    dst_val = src + 'validation\\'
    val_ratio = 0.2
    val_size = int(images_size * val_ratio)
    val_data_keys = images[train_size:train_size + val_size]
    val_data = dict()
    for idx, image_path in enumerate(val_data_keys):
        shutil.move(src + image_path, dst_val + image_path)
        val_data[image_path] = captions[train_size + idx:train_size + idx+5]

    dst_test = src + 'function_test\\'
    test_data_keys = images[train_size + val_size:]
    test_data = dict()
    for idx, image_path in enumerate(test_data_keys):
        shutil.move(src + image_path, dst_test + image_path)
        test_data[image_path] = captions[train_size + val_size + idx : train_size + val_size + idx + 5]

    with open("../datasets/train_data_yyejej.json", 'w', encoding='utf-8') as train_json:
        json.dump(train_data, train_json)

    with open("../datasets/val_data_yyejej.json", 'w', encoding='utf-8') as val_json:
        json.dump(val_data, val_json)

    with open("../datasets/test_data_yyejej.json", 'w', encoding='utf-8') as test_json:
        json.dump(test_data, test_json)

    return train_data, val_data, test_data
Example #14
def kogpt_test():
    config = get_config()
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path)
    sent = '나는 밥을 먹었'
    toked = tok(sent)
    input_ids = torch.tensor([
        vocab[vocab.bos_token],
    ] + vocab[toked]).unsqueeze(0)
    output = model(input_ids=input_ids)
    while 1:
        input_ids = torch.tensor([
            vocab[vocab.bos_token],
        ] + vocab[toked]).unsqueeze(0)
        pred = model(input_ids)[0]
        gen = vocab.to_tokens(torch.argmax(pred,
                                           axis=-1).squeeze().tolist())[-1]
        print(gen)
        if gen == '</s>':
            break
        sent += gen.replace('▁', ' ')
        toked = tok(sent)
    print(sent)
Example #15
def caption_test(vocab_path,
                 encoder_path,
                 decoder_path,
                 caption_path,
                 image_path,
                 config_path,
                 batch,
                 max_sequence_len,
                 word2idx_path=None):
    vocab = load_voca(vocab_path)
    cfg = get_config(config_path)

    embed_size = cfg['caption_embed_size']
    vocab_size = len(vocab)
    hidden_layers_num = cfg['caption_hidden_layer']
    hidden_size = cfg['caption_hidden_size']

    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, vocab, type="test")
        save_tokenized_data(dataset, type="test")

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_size, hidden_layers_num,
                         hidden_size)

    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    encoder.eval()
    decoder.eval()

    loader = make_caption_loader(dataset, batch, image_path)

    test_data_iter = iter(loader)
    images, captions, length = next(test_data_iter)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device_images = images.to(device)
    features = encoder(images)
    states = None

    # features is a 2-D tensor of shape (batch, embed_size). However, the LSTM used
    # below expects a 3-D input of shape (batch, num of embeddings, embed_size),
    # so an extra dimension is added to features.
    lstm_inputs = features.unsqueeze(1)
    predicted_index = []
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # outputs must be reshaped to a 2-D array before being fed into the linear layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    # predicted_index is just a flat list of tensors (max_sequence_len entries, each of length batch);
    # it has to be turned into a 2-D matrix of shape [batch X max_sequence_len].
    # ex)
    # predicted_index = [tensor([0,3,6]), tensor([1,4,7]), tensor([2,5,8])]
    # must become
    # [0,1,2]
    # [3,4,5]
    # [6,7,8]
    # When building the matrix, the existing tensors run along dim 0 (down the rows)
    # and each newly appended tensor is attached along dim 1 (across the columns).

    predicted_index = torch.stack(predicted_index, dim=1)
    # The tensor is currently on the GPU, so it has to be moved to the CPU before further processing.
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        caption = []
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            caption.append(word)
        result_captions.append(caption)

    return images, result_captions, captions
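The stacking step above can be seen in isolation; a minimal sketch showing how per-timestep predictions (one tensor of length batch per step) become a (batch, max_sequence_len) matrix of word indices:

import torch

per_step = [torch.tensor([0, 3, 6]), torch.tensor([1, 4, 7]), torch.tensor([2, 5, 8])]
per_sample = torch.stack(per_step, dim=1)  # each step becomes a column
print(per_sample)
# tensor([[0, 1, 2],
#         [3, 4, 5],
#         [6, 7, 8]])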
Example #16
def predict(images,root_path,AI_directory_path,model_type="life"):
    config = get_config()
    #0. Extract captions from images
    vocab = load_voca(AI_directory_path+config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path+config['caption_encoder_path']
    caption_decoder_path = AI_directory_path+config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30 #default value

    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833), std=(0.2738, 0.2664, 0.2766))])

    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab), caption_hidden_layer, caption_hidden_size)

    encoder.load_state_dict(torch.load(caption_encoder_path,map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path,map_location=device))
    images = load_image(images, root_path, transform)

    encoder.eval()
    decoder.eval()

    encoder.to(device)
    decoder.to(device)
    images = images.to(device)

    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)

    for i in range(max_sequence_len):
        outputs,states = decoder.lstm(lstm_inputs,states)
        # outputs must be reshaped to a 2-D array before being fed into the linear layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)

    predicted_index = torch.stack(predicted_index,dim=1)
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        text =""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)

    print("result_caption : ",result_captions)
    # 1. translate captions to korean

    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:
            translate_result = re.sub(r'\.','',translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ",korean_sentences)

    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path+config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path+config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path+config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path,map_location=device))

    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                         mask_token=None,
                                                         sep_token=None,
                                                         cls_token=None,
                                                         unknown_token='<unk>',
                                                         padding_token='<pad>',
                                                         bos_token='<s>',
                                                         eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    korean_preprocess(korean_sentences)
    gpt_result = naive_prediction(korean_sentences,tok,vocab,device,kogpt2model,model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result,"",result,0)
    result.sort(key=lambda item: (-len(item),item))
    result_len = len(result)
    if result_len >11:
        result_len = 11
    result = result[1:result_len]
    return result
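load_image is not shown in this example. Below is a minimal sketch of what it is assumed to do here (open each image under root_path, apply the normalization transform, and stack the results into one batch tensor); the real helper may differ.

import torch
from PIL import Image

def load_image_sketch(image_names, root_path, transform, size=(224, 224)):
    tensors = []
    for name in image_names:
        image = Image.open(root_path + name).convert('RGB').resize(size)
        tensors.append(transform(image))          # ToTensor + Normalize from above
    return torch.stack(tensors, dim=0)            # (N, 3, H, W) batch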
Example #17
def attention_beam_search_test(images, root_path):
    config = get_config()
    # 0. Extract captions from images
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    vocab = load_voca(AI_directory_path +
                      config['caption_attention_vocab_path'])
    emb_dim = config['caption_embed_size']
    decoder_dim = config['caption_hidden_size']
    attention_dim = config['caption_attention_dim']
    dropout = config['caption_dropout_ratio']
    caption_encoder_path = AI_directory_path + config[
        'caption_attention_encoder_path']
    caption_decoder_path = AI_directory_path + config[
        'caption_attention_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 50  # default value

    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))
    ])

    encoder = Encoder()
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(vocab),
                                   dropout=dropout)

    encoder.load_state_dict(
        torch.load(caption_encoder_path, map_location=device))
    decoder.load_state_dict(
        torch.load(caption_decoder_path, map_location=device))
    images = load_image(images, root_path, transform)

    encoder.eval()
    decoder.eval()

    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    batch = images.shape[0]

    predicted_index = []
    encoder_out = encoder(
        images)  # (1, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)

    # Flatten encoding
    encoder_out = encoder_out.view(batch, -1,
                                   encoder_dim)  # (1, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)
    k_prev_words = torch.LongTensor([[vocab('<start>')]] * batch).to(device)
    h, c = decoder.init_hidden_state(encoder_out)
    for i in range(max_sequence_len):
        embeddings = decoder.embedding(k_prev_words).squeeze(
            1)  # (s, embed_dim)
        awe, _ = decoder.attention(encoder_out,
                                   h)  # (s, encoder_dim), (s, num_pixels)
        gate = decoder.sigmoid(
            decoder.f_beta(h))  # gating scalar, (s, encoder_dim)
        awe = gate * awe
        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                   (h, c))  # (s, decoder_dim)
        scores = decoder.fc(h)  # (s, vocab_size)
        _, predicted = scores.max(1)
        predicted_index.append(predicted)
        k_prev_words = predicted.unsqueeze(1)

    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()

    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)

    print("result_caption : ", result_captions)
Example #18
def kogpt_life_recursive_test():
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"

    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    # sent = ' 신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있자, 신부님은 웨딩드레파란 셔츠를 입은 남자가 사다리에 서 있'
    # toked = tok(sent)
    # print(toked)
    # sent_cnt = 0

    # input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
    # input_ids = input_ids.to(device)
    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.', '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    kogpt_input_sentences = []
    for korean in korean_sentences:
        korean_size = len(korean)
        if not kogpt_input_sentences:
            korean_size = len(korean)
            if korean_size > 3:
                kogpt_input_sentences.append(korean[:-2])
            elif korean_size > 1:
                kogpt_input_sentences.append(korean[:-1])
            else:
                kogpt_input_sentences.append(korean)
        else:
            for i in range(len(kogpt_input_sentences)):
                if korean_size > 3:
                    kogpt_input_sentences[i] += korean[:-2]
                elif korean_size > 1:
                    kogpt_input_sentences[i] += korean[:-1]
                else:
                    kogpt_input_sentences[i] += korean[:]
        kogpt_output_sentences = []
        print(kogpt_input_sentences)
        expected_length = 50
        for kogpt_input_sentence in kogpt_input_sentences:
            print(kogpt_input_sentence)
            toked = tok(kogpt_input_sentence)
            input_ids = torch.tensor([
                vocab[vocab.bos_token],
            ] + vocab[toked]).unsqueeze(0)
            print(input_ids)
            input_ids = input_ids.to(device)
            input_length = input_ids.shape[1]
            outputs = kogpt2model.generate(input_ids=input_ids,
                                           max_length=input_length +
                                           expected_length,
                                           repetition_penalty=1.2,
                                           do_sample=True,
                                           num_beams=3,
                                           bos_token_id=0,
                                           pad_token_id=3,
                                           eos_token_id=1,
                                           num_return_sequences=3)
            for i in range(3):  # 3 output sequences were generated
                toked = vocab.to_tokens(outputs[i].squeeze().tolist())
                ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                             ''.join(toked).replace('▁', ' ').strip())
                kogpt_output_sentences.append(ret)
        kogpt_input_sentences = copy.deepcopy(kogpt_output_sentences)
    print(kogpt_input_sentences)