def build_voca_test():
    config = get_config()
    train_json_path = config['caption_train_path']
    with open(train_json_path, "r") as f:
        train_data = json.load(f)
    voca_root_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\datasets\\caption\\"
    build_voca(train_data, 5, voca_root_path)

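# build_voca is imported from elsewhere in this project. The sketch below is
# only an assumed reconstruction of what a frequency-threshold vocabulary
# builder typically looks like; the names VocabSketch and build_voca_sketch
# are hypothetical and the real implementation may differ.
from collections import Counter
import pickle


class VocabSketch:
    """Word <-> index mapping, mirroring the vocab.idx2word usage below."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words fall back to <unk>, as in the decoding loops below.
        return self.word2idx.get(word, self.word2idx['<unk>'])

    def __len__(self):
        return len(self.word2idx)


def build_voca_sketch(train_data, threshold, save_root):
    """Keep words appearing at least `threshold` times in the captions."""
    counter = Counter()
    for captions in train_data.values():
        for caption in captions:
            counter.update(caption.lower().split())
    vocab = VocabSketch()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)
    for word, count in counter.items():
        if count >= threshold:
            vocab.add_word(word)
    with open(save_root + "vocab.pickle", 'wb') as f:
        pickle.dump(vocab, f)
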
def attention_training_test():
    config = get_config()
    print(config)
    AI_DIREC = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    vocab_path = AI_DIREC + config['caption_vocab_path']
    word2idx_path = AI_DIREC + config['word2idx_train_path']
    image_path = config['caption_train_image_path']
    save_path = AI_DIREC + config['checkpoints_saved_path']
    print(save_path)
    caption_path = AI_DIREC + config['caption_train_path']
    print(vocab_path)
    print(word2idx_path)
    mini_batch_loss, encoder, decoder = attention_caption_train(
        vocab_path, image_path, config, caption_path,
        word2idx_path=word2idx_path)
    datestr = date2str()
    save_config(config, "attention_config" + datestr, save_path)
    save_loss(mini_batch_loss, "attention_loss" + datestr, save_path)
    save_model(encoder, "attention_encoder" + datestr, save_path)
    save_model(decoder, "attention_decoder" + datestr, save_path)

def caption_test_test():
    config = get_config()
    vocab_path = config['caption_vocab_path']
    image_path = config['caption_train_image_path']
    encoder_path = config['caption_encoder_path']
    decoder_path = config['caption_decoder_path']
    caption_path = config['caption_test_path']
    config_path = config['config_path']
    word2idx_path = config['word2idx_test_path']
    batch = 1
    max_sequence_len = 30
    # test(vocab_path, encoder_path, decoder_path, caption_path, image_path,
    #      config_path, batch, max_sequence_len, word2idx_path=None)
    images, result_captions, original_captions = caption_test(
        vocab_path, encoder_path, decoder_path, caption_path, image_path,
        config_path, batch, max_sequence_len, word2idx_path=word2idx_path)
    print(result_captions)
    visualize_img_caption(images, result_captions)

def train_procedure_test():
    config = get_config()
    load_path = config['word2idx_train_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    voca = load_voca(voca_path)
    batch_size = 2
    embed_size = 10
    vocab_len = len(voca)
    hidden_layer = 1
    hidden_size = 10
    loader = make_caption_loader(dataset, batch_size,
                                 config['caption_train_image_path'])
    dataiter = iter(loader)
    images, caption, length = next(dataiter)
    # Check the shapes of one batch
    print("Checking data shapes")
    print(images.size())
    print(caption.size())
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_len, hidden_layer, hidden_size)
    grad_params = list(encoder.linear.parameters())
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params=grad_params, lr=0.001)
    compare_target = pack_padded_sequence(caption, length,
                                          batch_first=True).data
    feature = encoder(images)
    output = decoder(caption, feature, length)
    loss = loss_function(output, compare_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    datestr = date2str()
    save_path = config['checkpoints_saved_path']
    mini_batch_loss = [loss.item()]
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)
    print("Is optimizer.zero_grad() the same as calling "
          "encoder.zero_grad() and decoder.zero_grad()?")
    print("Before calling optimizer.zero_grad():")
    print(encoder.linear.weight.grad)
    print("After calling optimizer.zero_grad():")
    optimizer.zero_grad()
    print(encoder.linear.weight.grad)
    print("====================")
    print(grad_params)

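# Why pack_padded_sequence is used above: nn.CrossEntropyLoss needs a flat
# target with the <pad> positions removed, and .data on a PackedSequence
# gives exactly that. A minimal, self-contained illustration:
def pack_padded_sequence_demo():
    import torch
    from torch.nn.utils.rnn import pack_padded_sequence
    caps = torch.tensor([[1, 2, 3, 0],   # length 3, one <pad>
                         [4, 5, 0, 0]])  # length 2, two <pad>
    flat = pack_padded_sequence(caps, [3, 2], batch_first=True).data
    print(flat)  # tensor([1, 4, 2, 5, 3]) -- time-major, padding dropped
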
def save_caption2idx_test():
    config = get_config()
    AI_DIREC = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    voca_path = AI_DIREC + config['caption_vocab_path']
    train_json_path = AI_DIREC + config['caption_train_path']
    voca = load_voca(voca_path)
    dataset = tokenized_data(train_json_path, voca)
    save_tokenized_data(dataset=dataset, AI_DIREC=AI_DIREC)

def loader_test():
    config = get_config()
    load_path = config['word2idx_test_path']
    voca_path = config['caption_vocab_path']
    dataset = load_tokenized_data(load_path)
    print(dataset['image_list'])
    voca = load_voca(voca_path)
    loader = make_caption_loader(dataset, 10, config['train_image_path'])
    dataiter = iter(loader)
    images, padded_caption, caption_length = next(dataiter)
    print(images)

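# make_caption_loader is defined elsewhere in this project. The collate
# function below is only an assumed sketch of how such a loader typically
# pads variable-length captions and returns (images, padded_captions,
# lengths), matching the unpacking in loader_test above.
def caption_collate_sketch(batch):
    import torch
    # batch: list of (image_tensor, caption_index_tensor) pairs
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)  # longest first
    images, captions = zip(*batch)
    images = torch.stack(images, dim=0)
    lengths = [len(cap) for cap in captions]
    padded = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        padded[i, :lengths[i]] = cap
    return images, padded, lengths
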
def save_tokenized_data(dataset, AI_DIREC="", save_path=None, type="train"):
    # AI_DIREC defaults to "" so callers that pass only the dataset
    # (see caption_test below) can rely on the config path alone.
    if type == "test":
        name = "wordVector_test"
    elif type == "val":
        name = "wordVector_val"
    else:
        name = "wordVector_train_5"
    name += ".pickle"
    if save_path is None:
        save_path = AI_DIREC + get_config()['caption_train_word_saved_path']
    save_path += name
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)

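# load_tokenized_data (used throughout this file) is presumably the pickle
# counterpart of save_tokenized_data above; a minimal assumed sketch:
def load_tokenized_data_sketch(load_path):
    import pickle
    with open(load_path, 'rb') as f:
        return pickle.load(f)
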
def korean_gpt_long_sentence_life_test():
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    sent = '나는 밥을 먹었'
    toked = tok(sent)
    print(toked)
    input_ids = torch.tensor([vocab[vocab.bos_token], ] +
                             vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)
    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)
    print("========= Essay =========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))

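# Note on the generate() flags above: with do_sample=True and num_beams=3,
# Hugging Face transformers samples within beam search ("beam sampling"),
# and num_return_sequences must not exceed num_beams. The bos/pad/eos token
# ids (0/3/1) are presumably chosen to match the KoGPT2 SentencePiece
# vocabulary's special-token indices.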
def fine_tune_test():
    AI_DIRECTORY = "C:\\Users\\multicampus\\yye\\s02p23c104\\Back\\AI"
    # AI_DIRECTORY = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    config = get_config()
    fine_tune_num = 0
    new_kogpt_model, loss_record = fine_tuning(config, fine_tune_num,
                                               AI_DIRECTORY)
    epoch = config['kogpt_epoch']
    datestr = date2str()
    default_name = "kogpt_story_"
    model_name = default_name + "model_" + str(epoch) + "_" + datestr
    loss_name = default_name + "loss_" + datestr
    root_path = AI_DIRECTORY + config['checkpoints_saved_path']
    save_model(new_kogpt_model, model_name, root_path)
    save_loss(loss_record, loss_name, root_path)

def build_korean_to_idx_test():
    config = get_config()
    # AI_DIRECTORY = "C:\\Users\\multicampus\\yye\\s02p23c104\\Back\\AI"
    AI_DIRECTORY = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_vocab_path = AI_DIRECTORY + config['kogpt_vocab_path']
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    file_path = AI_DIRECTORY + "\\datasets\\kogpt\\story_train_pkl.pkl"
    save_path = AI_DIRECTORY + "\\datasets\\kogpt\\"
    build_korean_to_idx(file_path, save_path, vocab, tok, block_size=256)

def dataset_split_save(path):
    data = get_path_caption(path)
    data_keys = list(data.keys())
    total_size = len(data)
    cfg = get_config()
    src = cfg['image_path']
    dst_train = src + 'train\\'
    # Split the full dataset 6:2:2 into train / validation / function_test.
    print(src)
    print(dst_train)
    train_ratio = 0.6
    train_size = int(total_size * train_ratio)
    train_data_keys = data_keys[:train_size]
    train_data = dict()
    for image_path in train_data_keys:
        shutil.move(src + image_path, dst_train + image_path)
        train_data[image_path] = data[image_path]
    dst_val = src + 'val\\'
    val_ratio = 0.2
    val_size = int(total_size * val_ratio)
    val_data_keys = data_keys[train_size:train_size + val_size]
    val_data = dict()
    for image_path in val_data_keys:
        shutil.move(src + image_path, dst_val + image_path)
        val_data[image_path] = data[image_path]
    dst_test = src + 'function_test\\'
    test_data_keys = data_keys[train_size + val_size:]
    test_data = dict()
    for image_path in test_data_keys:
        shutil.move(src + image_path, dst_test + image_path)
        test_data[image_path] = data[image_path]
    with open("../datasets/train_data.json", 'w', encoding='utf-8') as train_json:
        json.dump(train_data, train_json)
    with open("../datasets/val_data.json", 'w', encoding='utf-8') as val_json:
        json.dump(val_data, val_json)
    with open("../datasets/test_data.json", 'w', encoding='utf-8') as test_json:
        json.dump(test_data, test_json)
    return train_data, val_data, test_data

def caption_train_test():
    config = get_config()
    print(config)
    vocab_path = config['caption_vocab_path']
    word2idx_path = config['word2idx_train_path']
    image_path = config['caption_train_image_path']
    save_path = config['checkpoints_saved_path']
    caption_path = config['caption_train_path']
    mini_batch_loss, encoder, decoder = caption_train(
        vocab_path, image_path, config, caption_path,
        word2idx_path=word2idx_path)
    datestr = date2str()
    save_config(config, "config" + datestr, save_path)
    save_loss(mini_batch_loss, "loss" + datestr, save_path)
    save_model(encoder, "encoder" + datestr, save_path)
    save_model(decoder, "decoder" + datestr, save_path)

def all_datasets_split(path):
    images, captions = get_imagepath_caption(path)
    images_size = len(images)
    cfg = get_config()
    src = cfg['origin']
    dst_train = src + 'train\\'
    train_ratio = 0.6
    train_size = int(images_size * train_ratio)
    train_data_images = images[:train_size]
    train_data = {}
    # Each image is assumed to have 5 consecutive captions in `captions`,
    # so image i owns captions[i*5:(i+1)*5].
    for idx, image in enumerate(train_data_images):
        shutil.move(src + image, dst_train + image)
        train_data[image] = captions[idx * 5:(idx + 1) * 5]
    dst_val = src + 'validation\\'
    val_ratio = 0.2
    val_size = int(images_size * val_ratio)
    val_data_keys = images[train_size:train_size + val_size]
    val_data = dict()
    for idx, image_path in enumerate(val_data_keys):
        shutil.move(src + image_path, dst_val + image_path)
        global_idx = train_size + idx
        val_data[image_path] = captions[global_idx * 5:(global_idx + 1) * 5]
    dst_test = src + 'function_test\\'
    test_data_keys = images[train_size + val_size:]
    test_data = dict()
    for idx, image_path in enumerate(test_data_keys):
        shutil.move(src + image_path, dst_test + image_path)
        global_idx = train_size + val_size + idx
        test_data[image_path] = captions[global_idx * 5:(global_idx + 1) * 5]
    with open("../datasets/train_data_yyejej.json", 'w', encoding='utf-8') as train_json:
        json.dump(train_data, train_json)
    with open("../datasets/val_data_yyejej.json", 'w', encoding='utf-8') as val_json:
        json.dump(val_data, val_json)
    with open("../datasets/test_data_yyejej.json", 'w', encoding='utf-8') as test_json:
        json.dump(test_data, test_json)
    return train_data, val_data, test_data

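# Quick sanity check of the per-image caption indexing assumed in
# all_datasets_split (5 captions per image, stored consecutively):
def caption_indexing_sanity_check():
    captions = ['a0', 'a1', 'a2', 'a3', 'a4', 'b0', 'b1', 'b2', 'b3', 'b4']
    idx = 1  # second image
    assert captions[idx * 5:(idx + 1) * 5] == ['b0', 'b1', 'b2', 'b3', 'b4']
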
def kogpt_test():
    config = get_config()
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path)
    sent = '나는 밥을 먹었'
    toked = tok(sent)
    # Greedy decoding: repeatedly feed the growing sentence back into the
    # model and append the argmax token until it emits </s>.
    while True:
        input_ids = torch.tensor([vocab[vocab.bos_token], ] +
                                 vocab[toked]).unsqueeze(0)
        pred = model(input_ids)[0]
        gen = vocab.to_tokens(
            torch.argmax(pred, axis=-1).squeeze().tolist())[-1]
        print(gen)
        if gen == '</s>':
            break
        sent += gen.replace('▁', ' ')
        toked = tok(sent)
    print(sent)

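# SentencePiece marks word boundaries with '▁' rather than spaces, which is
# why kogpt_test reassembles text with .replace('▁', ' '). Illustration with
# hypothetical tokens (the actual tokenization depends on the trained model):
def sentencepiece_join_demo():
    toked = ['▁나는', '▁밥', '을', '▁먹었', '다']
    print(''.join(toked).replace('▁', ' ').strip())  # '나는 밥을 먹었다'
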
def caption_test(vocab_path,
                 encoder_path,
                 decoder_path,
                 caption_path,
                 image_path,
                 config_path,
                 batch,
                 max_sequence_len,
                 word2idx_path=None):
    vocab = load_voca(vocab_path)
    cfg = get_config(config_path)
    embed_size = cfg['caption_embed_size']
    vocab_size = len(vocab)
    hidden_layers_num = cfg['caption_hidden_layer']
    hidden_size = cfg['caption_hidden_size']
    if word2idx_path is not None:
        dataset = load_tokenized_data(word2idx_path)
    else:
        dataset = tokenized_data(caption_path, vocab, type="test")
        save_tokenized_data(dataset, type="test")
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, vocab_size, hidden_layers_num,
                         hidden_size)
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))
    encoder.eval()
    decoder.eval()
    loader = make_caption_loader(dataset, batch, image_path)
    test_data_iter = iter(loader)
    images, captions, length = next(test_data_iter)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)
    device_images = images.to(device)
    features = encoder(device_images)
    states = None
    # features has shape (batch, embed_size), but the LSTM expects a 3-D
    # input of shape (batch, sequence_length, embed_size), so unsqueeze a
    # length-1 time dimension.
    lstm_inputs = features.unsqueeze(1)
    predicted_index = []
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # Flatten outputs to 2-D for the linear scoring layer.
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)
    # predicted_index is a list of max_sequence_len tensors of shape (batch,);
    # stacking along dim=1 yields a (batch, max_sequence_len) matrix, e.g.
    # [tensor([0, 3, 6]), tensor([1, 4, 7]), tensor([2, 5, 8])] becomes
    # [[0, 1, 2],
    #  [3, 4, 5],
    #  [6, 7, 8]].
    predicted_index = torch.stack(predicted_index, dim=1)
    # The tensor may live on the GPU, so move it to the CPU before NumPy.
    predicted_index = predicted_index.cpu().numpy()
    result_captions = []
    for wordindices in predicted_index:
        caption = []
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            caption.append(word)
        result_captions.append(caption)
    return images, result_captions, captions

def predict(images, root_path, AI_directory_path, model_type="life"):
    config = get_config()
    # 0. Extract captions from the images
    vocab = load_voca(AI_directory_path + config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path + config['caption_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30  # default value
    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))
    ])
    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab),
                         caption_hidden_layer, caption_hidden_size)
    encoder.load_state_dict(torch.load(caption_encoder_path,
                                       map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path,
                                       map_location=device))
    images = load_image(images, root_path, transform)
    encoder.eval()
    decoder.eval()
    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # Flatten outputs to 2-D for the linear scoring layer.
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)
    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()
    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)
    print("result_caption : ", result_captions)
    # 1. Translate the captions to Korean
    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:
            translate_result = re.sub(r'\.', '', translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ", korean_sentences)
    # 2. Generate story candidates with the fine-tuned KoGPT2 model
    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path + config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path + config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path,
                                           map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    korean_preprocess(korean_sentences)
    gpt_result = naive_prediction(korean_sentences, tok, vocab, device,
                                  kogpt2model, model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result, "", result, 0)
    # Sort candidates longest-first (ties alphabetically) and keep at most
    # ten, skipping the first entry.
    result.sort(key=lambda item: (-len(item), item))
    result_len = len(result)
    if result_len > 11:
        result_len = 11
    result = result[1:result_len]
    return result

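# Example usage of predict (illustrative file names; the AI directory path is
# the one used elsewhere in this file):
# stories = predict(["wedding.jpg"], "./uploads/",
#                   "C:\\Users\\multicampus\\s02p23c104\\Back\\AI",
#                   model_type="life")
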
def attention_beam_search_test(images, root_path):
    config = get_config()
    # 0. Extract captions from the images
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    vocab = load_voca(AI_directory_path +
                      config['caption_attention_vocab_path'])
    emb_dim = config['caption_embed_size']
    decoder_dim = config['caption_hidden_size']
    attention_dim = config['caption_attention_dim']
    dropout = config['caption_dropout_ratio']
    caption_encoder_path = AI_directory_path + config['caption_attention_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_attention_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 50  # default value
    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))
    ])
    encoder = Encoder()
    decoder = DecoderWithAttention(attention_dim=attention_dim,
                                   embed_dim=emb_dim,
                                   decoder_dim=decoder_dim,
                                   vocab_size=len(vocab),
                                   dropout=dropout)
    encoder.load_state_dict(torch.load(caption_encoder_path,
                                       map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path,
                                       map_location=device))
    images = load_image(images, root_path, transform)
    encoder.eval()
    decoder.eval()
    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    batch = images.shape[0]
    predicted_index = []
    encoder_out = encoder(images)  # (batch, enc_image_size, enc_image_size, encoder_dim)
    enc_image_size = encoder_out.size(1)
    encoder_dim = encoder_out.size(3)
    # Flatten the spatial encoding
    encoder_out = encoder_out.view(batch, -1, encoder_dim)  # (batch, num_pixels, encoder_dim)
    num_pixels = encoder_out.size(1)
    k_prev_words = torch.LongTensor([[vocab('<start>')]] * batch).to(device)
    h, c = decoder.init_hidden_state(encoder_out)
    # Note: despite the function name, this loop decodes greedily (argmax at
    # each step) rather than with beam search.
    for i in range(max_sequence_len):
        embeddings = decoder.embedding(k_prev_words).squeeze(1)  # (batch, embed_dim)
        awe, _ = decoder.attention(encoder_out, h)  # (batch, encoder_dim), (batch, num_pixels)
        gate = decoder.sigmoid(decoder.f_beta(h))  # gating scalar, (batch, encoder_dim)
        awe = gate * awe
        h, c = decoder.decode_step(torch.cat([embeddings, awe], dim=1),
                                   (h, c))  # (batch, decoder_dim)
        scores = decoder.fc(h)  # (batch, vocab_size)
        _, predicted = scores.max(1)
        predicted_index.append(predicted)
        k_prev_words = predicted.unsqueeze(1)
    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()
    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)
    print("result_caption : ", result_captions)

def kogpt_life_recursive_test():
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.',
        '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    # Trim each sentence's ending (drop the final 1-2 characters) so the
    # model continues an unfinished clause, then append each subsequent
    # trimmed sentence to every existing prompt.
    kogpt_input_sentences = []
    for korean in korean_sentences:
        korean_size = len(korean)
        if not kogpt_input_sentences:
            if korean_size > 3:
                kogpt_input_sentences.append(korean[:-2])
            elif korean_size > 1:
                kogpt_input_sentences.append(korean[:-1])
            else:
                kogpt_input_sentences.append(korean)
        else:
            for i in range(len(kogpt_input_sentences)):
                if korean_size > 3:
                    kogpt_input_sentences[i] += korean[:-2]
                elif korean_size > 1:
                    kogpt_input_sentences[i] += korean[:-1]
                else:
                    kogpt_input_sentences[i] += korean[:]
    kogpt_output_sentences = []
    print(kogpt_input_sentences)
    expected_length = 50
    for kogpt_input_sentence in kogpt_input_sentences:
        print(kogpt_input_sentence)
        toked = tok(kogpt_input_sentence)
        input_ids = torch.tensor([vocab[vocab.bos_token], ] +
                                 vocab[toked]).unsqueeze(0)
        print(input_ids)
        input_ids = input_ids.to(device)
        input_length = input_ids.shape[1]
        outputs = kogpt2model.generate(input_ids=input_ids,
                                       max_length=input_length + expected_length,
                                       repetition_penalty=1.2,
                                       do_sample=True,
                                       num_beams=3,
                                       bos_token_id=0,
                                       pad_token_id=3,
                                       eos_token_id=1,
                                       num_return_sequences=3)
        for i in range(3):  # 3 output sequences were generated
            toked = vocab.to_tokens(outputs[i].squeeze().tolist())
            ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                         ''.join(toked).replace('▁', ' ').strip())
            kogpt_output_sentences.append(ret)
    kogpt_input_sentences = copy.deepcopy(kogpt_output_sentences)
    print(kogpt_input_sentences)