def __init__(self, hparams, **kwargs):
    """Set up the KoGPT2 chat module.

    Stores the hyper-parameters, resolves the tokenizer path, loads the
    KoGPT2 network together with its vocabulary and creates an unreduced
    cross-entropy loss.

    Args:
        hparams: Hyper-parameter container, kept on ``self.hparams``.
        **kwargs: Accepted for call compatibility; not used here.
    """
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams
    self.tok_path = get_tokenizer()
    # Large negative constant kept on the instance for later use.
    self.neg = -1e18
    pretrained = get_pytorch_kogpt2_model()
    self.kogpt2, self.vocab = pretrained
    # reduction='none' keeps one loss value per element instead of a mean.
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def sentencePieceTokenizer():
    """Return a SentencepieceTokenizer built from ``get_tokenizer()``.

    ``num_best=0`` and ``alpha=0`` are forwarded unchanged to the
    tokenizer constructor.
    """
    return SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
def chat(model_params, sent='0'):
    """Run an interactive console chat loop with the MXNet KoGPT2 model.

    Each question read from stdin is answered token by token: the model is
    re-run on the growing prompt and the last predicted token is appended
    until ``EOS`` is produced. Typing ``quit`` ends the loop.

    Args:
        model_params: Parameter file handed to ``load_parameters``.
        sent: Text tokenized once and inserted into every prompt between
            the question and the answer.
    """
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)

    while True:
        question = input('user > ').strip()
        if question == 'quit':
            return
        q_tok = tok(question)
        # Everything up to the answer tokens is fixed for this question.
        prompt_ids = ([vocab[U_TKN]] + vocab[q_tok] + vocab[EOS, SENT] +
                      vocab[sent_tokens] + vocab[EOS, S_TKN])
        answer = ''
        a_tok = []
        while True:
            input_ids = mx.nd.array(prompt_ids + vocab[a_tok]).expand_dims(
                axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            best_ids = mx.nd.argmax(pred, axis=-1).squeeze().astype('int')
            gen = vocab.to_tokens(best_ids.asnumpy().tolist())[-1]
            if gen == EOS:
                break
            answer += gen.replace('▁', ' ')
            # Re-tokenize the whole answer so the next step sees it.
            a_tok = tok(answer)
        print("Simsimi > {}".format(answer.strip()))
def chat(kogptqa, sent='0'):
    """Run an interactive console chat loop with a PyTorch KoGPT2 chat model.

    Questions are read from stdin; the answer is decoded greedily one token
    at a time until ``EOS`` is produced. Typing ``quit`` ends the loop.
    The whole loop runs under ``torch.no_grad()``.

    Args:
        kogptqa: Callable model; given a ``(1, seq)`` LongTensor of ids it
            returns the scores that are arg-maxed over the last axis.
        sent: Text tokenized once and inserted into every prompt between
            the question and the answer.
    """
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)

    with torch.no_grad():
        while True:
            question = input('user > ').strip()
            if question == 'quit':
                break
            q_tok = tok(question)
            # Everything up to the answer tokens is fixed for this question.
            prompt_ids = ([vocab[U_TKN]] + vocab[q_tok] + vocab[EOS, SENT] +
                          vocab[sent_tokens] + vocab[EOS, S_TKN])
            answer = ''
            a_tok = []
            while True:
                input_ids = torch.LongTensor(
                    prompt_ids + vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                best_ids = torch.argmax(pred, dim=-1).squeeze()
                gen = vocab.to_tokens(best_ids.numpy().tolist())[-1]
                if gen == EOS:
                    break
                answer += gen.replace('▁', ' ')
                # Re-tokenize the whole answer so the next step sees it.
                a_tok = tok(answer)
            print("Simsimi > {}".format(answer.strip()))
def Tokenizer(item):
    """Encode every text in *item* to fixed-width token ids and save a CSV.

    Each text is tokenized, prefixed with the BOS id, truncated or padded
    to ``max_seqlen`` ids and stacked into one 2-D tensor, which is
    written to ``./data/encoded.csv``.

    Fixes over the previous version:
      * rows are collected in a list and stacked once (concatenating a
        tensor onto the initial ``[]`` raised an error),
      * padding uses the vocabulary's padding id instead of uninitialised
        ``torch.empty`` memory,
      * inputs longer than ``max_seqlen`` are truncated instead of failing
        on a negative pad size,
      * the unused local ``max`` (which shadowed the builtin) is gone.

    Args:
        item: Object exposing ``tolist()`` that yields the texts.

    NOTE(review): ``max_seqlen`` is read from module scope and is not
    visible in this block - confirm it is defined before this is called.
    """
    texts = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    bos_id = vocab[vocab.bos_token]
    pad_id = vocab[vocab.padding_token]

    rows = []
    for text in texts:
        ids = ([bos_id] + vocab[tok(text)])[:max_seqlen]
        ids += [pad_id] * (max_seqlen - len(ids))
        rows.append(ids)

    # reshape keeps a (0, max_seqlen) shape when there are no texts.
    out = torch.tensor(rows, dtype=torch.long).reshape(-1, max_seqlen)
    print(out.shape)
    x_df = pd.DataFrame(out.numpy())
    x_df.to_csv('./data/encoded.csv', mode='w')
def __init__(self, hparams, **kwargs):
    """Set up the KoGPT2 chat module with the model requested on "cuda".

    Besides the model, vocabulary and loss, two counters for peak GPU load
    and peak memory use during training are initialised to zero.

    Args:
        hparams: Hyper-parameter container, kept on ``self.hparams``.
        **kwargs: Accepted for call compatibility; not used here.
    """
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams
    self.tok_path = get_tokenizer()
    # Large negative constant kept on the instance for later use.
    self.neg = -1e18
    pretrained = get_pytorch_kogpt2_model("cuda")
    self.kogpt2, self.vocab = pretrained
    # reduction='none' keeps one loss value per element instead of a mean.
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
    # Training-time resource statistics, both starting at zero.
    self.max_gpu_load_train = 0
    self.max_memory_used_train = 0.0
def __init__(self, max_len=32, batch_size=64, lr=5e-5, num_epochs=1):
    """Set up the KoGPT2 chat module from explicit training settings.

    Args:
        max_len: Stored on ``self.max_len``.
        batch_size: Stored on ``self.batch_size``.
        lr: Stored on ``self.lr``.
        num_epochs: Stored on ``self.num_epochs``.
    """
    super(KoGPT2Chat, self).__init__()
    # Plain training settings.
    self.max_len = max_len
    self.batch_size = batch_size
    self.lr = lr
    self.num_epochs = num_epochs
    # Large negative constant kept on the instance for later use.
    self.neg = -1e18
    # Tokenizer path, pretrained network and vocabulary.
    self.tok_path = get_tokenizer()
    pretrained = get_pytorch_kogpt2_model()
    self.kogpt2, self.vocab = pretrained
    # reduction='none' keeps one loss value per element instead of a mean.
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def __init__(self, hparams, **kwargs):
    """Set up the KoGPT2 chat module.

    Args:
        hparams: Hyper-parameter container (carries the parsed args),
            kept on ``self.hparams``.
        **kwargs: Accepted for call compatibility; not used here.
    """
    super(KoGPT2Chat, self).__init__()
    # hparams carries the argument values.
    self.hparams = hparams
    self.tok_path = get_tokenizer()
    self.neg = -1e18
    # The loader returns two things: the model and the vocabulary.
    pretrained = get_pytorch_kogpt2_model()
    self.kogpt2, self.vocab = pretrained
    # Cross-entropy loss between the labels and the GPT-2 output,
    # left unreduced (reduction='none').
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def __init__(self):
    """Initialise padding/unknown constants and both tokenizers.

    Creates a ``Mecab`` tokenizer (``self.tok``), loads the KoGPT2
    vocabulary (the model itself is discarded) and builds a
    SentencePiece tokenizer (``self.tok2``) from the tokenizer path.
    """
    # Reserved indices and their token strings.
    self.PAD_IDX, self.UNK_IDX = 0, 1
    self.PAD_TOKEN, self.UNK_TOKEN = 'PAD_TOKEN', 'UNK_TOKEN'

    self.tok = Mecab()
    # Only the vocabulary half of the returned pair is kept.
    self.vocab = get_pytorch_kogpt2_model()[1]
    self.tok_path = get_tokenizer()
    self.tok2 = SentencepieceTokenizer(
        self.tok_path,
        num_best=0,
        alpha=0,
    )
def __init__(self, vocab, MAX_LEN=32):
    """Load the chatbot CSV and prepare tokens, vocabulary and padder.

    Args:
        vocab: Vocabulary object; its padding token supplies the pad value.
        MAX_LEN: Length handed to ``nlp.data.PadSequence``.
    """
    self.DATA = pd.read_csv('./TK_data/Chatbot_data/ChatbotData.csv')
    self._tok_path = get_tokenizer()
    # No tokenizer is created here; ``first`` starts out True.
    self.tokenizer = None
    self.first = True

    # Special tokens taken from module-level constants.
    self.q_token = U_TKN    # marks the start of a question
    self.a_token = S_TKN    # marks the start of an answer
    self.sent_token = SENT
    self.bos = BOS
    self.eos = EOS
    self.maskt = MASK

    self.vocab = vocab
    self.MAX_LEN = MAX_LEN
    pad_id = self.vocab[self.vocab.padding_token]
    self.padder = nlp.data.PadSequence(MAX_LEN, pad_val=pad_id)
def Load_Model():
    """Load the MXNet KoGPT2 chat model and publish it through globals.

    Sets ``vocab_global``, ``sent_tokens_global``, ``kogptqa_global`` and
    ``tok_global`` so other functions can reuse the loaded objects.

    The parameter file path is now built with ``os.path.join``. The old
    literal ``"KoGPT2-chatbot\\kogpt2_chat.params"`` relied on the invalid
    escape sequence ``\\k`` and only resolved on Windows; the joined path
    is identical there and also works on POSIX systems.
    """
    import os  # local import: this block cannot see the file's import list

    global vocab_global
    global sent_tokens_global
    global kogptqa_global
    global tok_global

    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    kogptqa = KoGPT2Chat(model)
    params_path = os.path.join("KoGPT2-chatbot", "kogpt2_chat.params")
    kogptqa.load_parameters(params_path, ctx=ctx)

    # The text "0" is tokenized once and shared.
    sent_tokens = tok("0")

    vocab_global = vocab
    sent_tokens_global = sent_tokens
    kogptqa_global = kogptqa
    tok_global = tok
def dataset(file_path):
    """Read a UTF-8 text file and encode each line as a list of token ids.

    Every line is tokenized and wrapped as ``[bos] + ids + [eos]``.

    Changes over the previous version:
      * the file is opened with ``with`` so it is closed even if
        tokenization raises,
      * only a trailing newline is stripped; ``line[:-1]`` used to drop a
        real character from a final line that had no newline.

    Args:
        file_path: Path of the text file, one example per line.

    Returns:
        A list with one list of ids per line of the file.

    NOTE(review): ``vocab`` is read from module scope and is not visible
    in this block.
    """
    tokenizer = SentencepieceTokenizer(get_tokenizer())
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            tokens = tokenizer(raw_line.rstrip('\n'))
            data.append([vocab[vocab.bos_token]] + vocab[tokens] +
                        [vocab[vocab.eos_token]])
    return data
def __init__(self, load_path):
    """Load a fine-tuned KoGPT-2 checkpoint, its vocabulary and tokenizer.

    The weights found at *load_path* are re-keyed with the key names of a
    reference checkpoint before being loaded into a fresh
    ``GPT2LMHeadModel``, which is then moved to "cuda" and put in eval
    mode.

    Args:
        load_path: Path of the checkpoint holding the trained weights.
    """
    ctx = "cuda"
    cachedir = "~/kogpt2/"
    org_path = "trained_models/gpt2_j20_1007.pt"

    # Download the vocabulary file.
    vocab_info = tokenizer
    vocab_path = download(
        vocab_info["url"],
        vocab_info["fname"],
        vocab_info["chksum"],
        cachedir=cachedir,
    )

    device = torch.device(ctx)

    # Load the saved checkpoint and the reference checkpoint.
    checkpoint = torch.load(load_path, map_location=device)
    checkpoint_org = torch.load(org_path, map_location=device)
    # Since special tokens were trained the key names differ, so the new
    # values are paired positionally with the reference key names.
    ckpt_final = dict(zip(checkpoint_org.keys(), checkpoint.values()))

    # Build the language model and load the re-keyed weights.
    gpt2_config = GPT2Config.from_dict(kogpt2_config)
    self.kogpt2model = GPT2LMHeadModel(config=gpt2_config)
    self.kogpt2model.load_state_dict(ckpt_final)
    self.kogpt2model.to(device)
    self.kogpt2model.eval()

    self.vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token="<unk>",
        padding_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )
    self.tok = SentencepieceTokenizer(get_tokenizer())
def __init__(self, vocab, MAX_LEN=2048):
    """Index every ``*.txt`` file under ``./TK_data/T0_data``.

    For each file its path, running index and line count are recorded in
    ``DATA_PATH``, ``DATA_PATH_IDX`` and ``DATA_PATH_LEN``.

    Fixes over the previous version:
      * the line-counting loop had no exit (its ``break`` was commented
        out) and therefore never terminated,
      * files are opened with ``with`` so they are always closed,
      * the unused local ``TEMP_MAX`` is removed.

    Args:
        vocab: Vocabulary object; its padding token supplies the pad value.
        MAX_LEN: Length handed to ``nlp.data.PadSequence``.
    """
    # Special tokens taken from module-level constants.
    self.q_token = U_TKN    # marks the start of a question
    self.a_token = S_TKN    # marks the start of an answer
    self.bos = BOS
    self.eos = EOS
    self.maskt = MASK
    self.sent_token = SENT

    self.folder_path = "./TK_data/T0_data"
    self.DATA_PATH = []
    self.DATA_PATH_IDX = []
    self.DATA_PATH_LEN = []
    self.previous_context = None
    self.MAX_LEN = MAX_LEN

    self._tok_path = get_tokenizer()
    self.tokenizer = None
    self.first = True
    self.vocab = vocab
    self.padder = nlp.data.PadSequence(
        MAX_LEN, pad_val=self.vocab[self.vocab.padding_token])

    for index, file_path in enumerate(
            glob.glob(self.folder_path + "/*.txt")):
        with open(file_path, 'r', encoding='utf-8') as file:
            # NOTE(review): stores the exact number of lines. The disabled
            # loop would have counted one extra read at end-of-file -
            # confirm which value downstream code expects.
            data_len = sum(1 for _ in file)
        self.DATA_PATH.append(file_path)
        self.DATA_PATH_IDX.append(index)
        self.DATA_PATH_LEN.append(data_len)
def sentence_generation(random_tok, model_, vocab_):
    """Greedily generate text starting from a single seed token id.

    At most 30 decoding steps are run. The first step feeds
    ``[bos, random_tok]``; later steps feed ``bos`` plus the re-tokenized
    text generated so far. Decoding stops early when ``</s>`` is produced.

    Args:
        random_tok: Token id used as the seed after BOS.
        model_: Callable model; element 0 of its output is arg-maxed.
        vocab_: Vocabulary used for id/token conversion.

    Returns:
        The generated string, with '▁' replaced by spaces.
    """
    tok = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)
    bos_id = vocab_[vocab_.bos_token]
    sent = ''
    toked = None
    for step in range(30):
        # Only the very first step uses the raw seed id.
        tail_ids = [random_tok] if step == 0 else vocab_[toked]
        input_ids = torch.tensor([bos_id] + tail_ids).unsqueeze(0)
        pred = model_(input_ids)[0]
        best_ids = torch.argmax(pred, axis=-1).squeeze().tolist()
        gen = vocab_.to_tokens(best_ids)[-1]
        if gen == '</s>':
            break
        sent += gen.replace('▁', ' ')
        toked = tok(sent)
    return sent
device = torch.device(ctx) kogpt2model.to(device) # Fine Tunning을 위해 train 선언 kogpt2model.train() # 단어 뭉치 가져오기 vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_path, mask_token=None, sep_token=None, cls_token=None, unknown_token='<unk>', padding_token='<pad>', bos_token='<s>', eos_token='</s>') ########################################################################################## tok_path = get_tokenizer() vocab = vocab_b_obj sentencepieceTokenizer = SentencepieceTokenizer(tok_path) print("데이터 로드") data_file_path = 'Data_crawler/dataset/삼성전자_pred/pre_삼성전자_연합인포맥스.json' news_data = GPT_Dataset_Train(data_file_path) news_dataset = GPTDataset( news_data, vocab, sentencepieceTokenizer) # Torch DataLoader 형태 맞춰주는 Dataset 설정 news_data_loader = DataLoader(news_dataset, batch_size=4, shuffle=True, pin_memory=True,
def run():
    """Generate outputs for a test set with the KoBERT + KoGPT2 Seq2Seq model.

    Parses command-line options, loads both models and their tokenizers,
    restores the fine-tuned weights and writes one generated line per test
    example to ``<model_checkpoint>_output.txt``.

    Fixes over the previous version:
      * ``--temperature`` is parsed as ``float``; with ``type=int`` any
        fractional value given on the command line (including the
        documented default 0.7) was rejected,
      * the output file is managed by ``with`` so it is closed on error,
      * a dead string-literal block of commented-out code is removed.

    Raises:
        ValueError: If ``--model gpt2`` is chosen without a checkpoint.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path", type=str, default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--use_adapter", default=False, action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module", type=str, default="",
                        help="add, attention, ")
    # Anything besides gpt2 will load openai-gpt.
    parser.add_argument(
        "--model", type=str, default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt', 'gpt2'])
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--device", type=str,
        default="cuda" if torch.cuda.is_available() else "cpu",
        help="Device (cuda or cpu)")
    parser.add_argument("--bert_model_path", default="./", type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file", default="./vocab.korean.rawtext.list", type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--no_sample", action='store_true',
        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    # float, not int: temperatures are fractional.
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k", type=int, default=50,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p", type=float, default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Set this flag if you are using an uncased model.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        args.model_checkpoint = download_pretrained_model()

    # A seed of 0 means "do not seed".
    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # Load KoBERT model and tokenizer.
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)
    bert_model.eval()

    # Load KoGPT2 model and tokenizer.
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_conkogpt2_model2(
        use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)
    gpt_model.eval()

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    model.eval()

    logger.info("Load test data")
    sourceList, targetList = get_test_dataset(bert_tokenizer, gpt_tokenizer,
                                              gpt_vocab, args.dataset_path)

    with open(args.model_checkpoint + "_output.txt", 'w') as f1:
        # Only the source side is used for generation; zip() still limits
        # the iteration to the shorter of the two lists.
        for source, _target in zip(sourceList, targetList):
            out_ids = sample_sequence(source, bert_model, bert_tokenizer,
                                      gpt_model, gpt_vocab, args)
            out_texts = gpt_vocab.to_tokens(out_ids)
            for text in out_texts:
                f1.write(text.replace('▁', ' ').replace('</s>', ' '))
            f1.write("\n")
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cuda',
         cachedir='~/kogpt2/',
         samples="./gdrive/My Drive/KoGPT2-FineTuning_pre/samples/"):
    """Load a KoGPT-2 checkpoint and print sampled continuations.

    Changes over the previous version:
      * the bare ``except:`` around ``load_path.split("/")[-2]`` is
        replaced by an explicit length check,
      * the directory listing whose only consumer was commented out is
        removed; it could abort generation when the samples directory was
        missing although nothing is written there.

    Args:
        temperature, top_p, top_k, text_size: Forwarded to
            ``sample_sequence``.
        tmp_sent: Prompt text; when empty it is read from stdin.
        loops: When truthy, the function returns once the internal counter
            (starting at 1, incremented per sample) reaches this value.
        load_path: Checkpoint containing ``model_state_dict``.
        ctx: Device string used for ``map_location``.
        cachedir: Download cache directory.
        samples: Kept for interface compatibility; no longer read.
    """
    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # The base model download is kept; its path is not needed afterwards.
    model_info = pytorch_kogpt2
    download(model_info['url'], model_info['fname'], model_info['chksum'],
             cachedir=cachedir)
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    device = torch.device(ctx)
    # Load the saved checkpoint.
    checkpoint = torch.load(load_path, map_location=device)

    # Build the language model and restore the fine-tuned weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])
    kogpt2model.eval()

    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    num = 1 if loops else 0

    # Reduce the path to its parent directory name when it has one.
    path_parts = load_path.split("/")
    if len(path_parts) >= 2:
        load_path = path_parts[-2]
    print("weight load - ", load_path)

    while True:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)
        if len(toked) > 1022:
            break

        # Actual generation.
        sent = sample_sequence(model, tok, vocab, sent, text_size,
                               temperature, top_p, top_k)
        # "//" stands for a line break in the generated text.
        sent = sent.replace("//", "\n")
        sent = sent.replace("</s>", "")
        sent = auto_enter(sent)
        print(sent)

        if num != 0:
            num += 1
            if num >= loops:
                print("good")
                return
def sentencePieceTokenizer():
    """Return a SentencepieceTokenizer built from ``get_tokenizer()``."""
    return SentencepieceTokenizer(get_tokenizer())
def get_model_result(tmp_sent):
    """Generate a multi-sentence continuation of *tmp_sent* with KoGPT-2.

    A checkpoint is loaded on CPU, then up to 10 rounds are run. Each
    round appends its starting text to the result and samples tokens with
    ``topkSampling`` until a token containing '.' appears or more than 50
    tokens were produced; the text generated in a round seeds the next.

    Args:
        tmp_sent: The user's starting text.

    Returns:
        The concatenation of the starting texts of all rounds.
    """
    # 1. KoGPT2 configuration.
    ctx = 'cpu'  # device: CPU or GPU
    cachedir = '~/nlp/'  # download location of the KoGPT-2 files
    load_path = './checkpoint/narrativeKoGPT2_checkpoint.tar'
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # 2. Download the vocabulary.
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    # 3. Device and checkpoint.
    device = torch.device(ctx)
    checkpoint = torch.load(load_path, map_location=device)
    model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    # 4. Tokenizer.
    tok = SentencepieceTokenizer(get_tokenizer())

    # 5. Text generation.
    max_tokens_per_round = 50
    pieces = []
    sent = ''
    for _round in range(10):
        # First round starts from the user text, later rounds from the
        # text produced by the previous round.
        sent = sent + tmp_sent if sent == '' else generated_text
        pieces.append(sent)

        toked = tok(sent)
        count = 0
        generated_text = ''
        if len(toked) > 1022:
            break

        while True:
            input_ids = torch.tensor(
                [vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0)
            pred = model(input_ids)[0]
            gen = topkSampling(pred, 10, vocab)
            if '</s>' in gen:
                gen = gen.replace('</s>', '')

            round_done = '.' in gen or count > max_tokens_per_round
            # The closing token strips '</', every other token strips '<'.
            stripped = '</' if round_done else '<'
            piece = gen.replace('▁', ' ').replace(stripped, '')
            sent += piece
            generated_text += piece
            toked = tok(sent)
            if round_done:
                count = 0
                break
            count += 1

        split = sent.split('\n')
        if len(split) > 1 and sent == split[1]:
            break

    return ''.join(pieces)
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=0,
         load_path=""):
    """Sample text from a KoGPT-2 checkpoint and store each sample on disk.

    Every generated sample is printed and written to ``samples/<n>``, where
    ``<n>`` is one more than the largest numeric file name already in
    ``./samples``. The first line of the file records the settings.

    Change over the previous version: the sample file is written inside a
    ``with`` block, so the handle is closed even when a write fails.

    Args:
        temperature, top_p, top_k, text_size: Forwarded to
            ``sample_sequence`` and recorded in the sample header.
        tmp_sent: Prompt text; when empty it is read from stdin. It is
            reset after every sample.
        loops: When truthy, the function returns once the internal counter
            (starting at 1, incremented per sample) reaches this value.
        load_path: Checkpoint containing ``model_state_dict``.
    """
    ctx = 'cuda'
    cachedir = '~/kogpt2/'

    # Download the base model (the returned path is not needed here).
    model_info = pytorch_kogpt2
    download(model_info['url'], model_info['fname'], model_info['chksum'],
             cachedir=cachedir)
    # Download the vocabulary.
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    device = torch.device(ctx)
    # Load the saved checkpoint.
    checkpoint = torch.load(load_path, map_location=device)

    # Build the language model and restore the fine-tuned weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])
    kogpt2model.eval()

    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    num = 1 if loops else 0

    while True:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)
        if len(toked) > 1022:
            break

        sent = sample_sequence(model, tok, vocab, sent, text_size,
                               temperature, top_p, top_k)
        # "<unused0>" stands for a line break in the generated text.
        sent = sent.replace("<unused0>", "\n")
        sent = auto_enter(sent)
        print(sent)

        existing = [int(n) for n in os.listdir("./samples")]
        now = max(existing, default=0)

        head = [load_path, tmp_sent, text_size, temperature, top_p, top_k]
        with open("samples/" + str(now + 1), 'w', encoding="utf-8") as f:
            f.write(",".join(str(h) for h in head))
            f.write("\n")
            f.write(sent)

        tmp_sent = ""

        if num != 0:
            num += 1
            if num >= loops:
                print("good")
                return
import os
import torch
import platform
import sentencepiece
from kogpt2.utils import get_tokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from flask import Flask, request, jsonify, __version__ as flaskver

# Tokenizer path, model and vocabulary are all cached under ./bin/.
tok_path = get_tokenizer(cachedir='./bin/')
model, vocab = get_pytorch_kogpt2_model(cachedir='./bin/')
tok = sentencepiece.SentencePieceProcessor(tok_path)

app = Flask(__name__)
# Listening port, taken from the "port" environment variable (default 8080).
port = int(os.getenv('port', '8080'))


@app.route('/', methods=['GET'])
def root():
    """Describe the service: runtime versions, project URLs and usage."""
    return jsonify(
        label='kogpt2',
        urls={
            'original': 'https://github.com/SKT-AI/KoGPT2',
            'fork': 'https://github.com/pmh-only/KoGPT2'
        },
        env={
            'python': platform.python_version(),
            'flask': flaskver,
            'pytorch': torch.__version__
        },
        usage='GET /job?query=<sentence>[&loop=<loopLimit>]')
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cpu',
         cachedir='~/kogpt2/',
         samples="./samples"):
    """Sample text from a KoGPT-2 checkpoint, saving and returning samples.

    Fixes over the previous version:
      * sample files are written inside *samples* via ``os.path.join``;
        plain concatenation with the default ``"./samples"`` produced
        ``./samples1`` next to the directory that was listed,
      * an empty samples directory no longer crashes ``max()``,
      * the file is written inside a ``with`` block,
      * the collected samples are also returned when the loop stops
        because the prompt is too long (previously ``None``).

    Args:
        temperature, top_p, top_k, text_size: Forwarded to
            ``sample_sequence``.
        tmp_sent: Prompt text; when empty it is read from stdin.
        loops: ``-1`` disables the counter; otherwise the function returns
            once the counter (starting at 1, incremented per sample)
            reaches this value.
        load_path: Checkpoint containing ``model_state_dict``.
        ctx: Device string used for ``map_location``.
        cachedir: Download cache directory.
        samples: Directory holding numerically named sample files.

    Returns:
        Dict mapping the counter value at generation time to the sample.
    """
    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # The base model download is kept; its path is not needed afterwards.
    model_info = pytorch_kogpt2
    download(model_info['url'], model_info['fname'], model_info['chksum'],
             cachedir=cachedir)
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    device = torch.device(ctx)
    # Load the saved checkpoint.
    checkpoint = torch.load(load_path, map_location=device)

    # Build the language model and restore the fine-tuned weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])
    kogpt2model.eval()

    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    # Re-register the id of "<unused0>" under the newline character.
    vocab.token_to_idx["\n"] = vocab.token_to_idx["<unused0>"]
    del vocab.token_to_idx["<unused0>"]
    tok = SentencepieceTokenizer(tok_path)

    sent_dict = {}
    num = 1 if loops != -1 else 0

    while True:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)
        if len(toked) > 1022:
            break

        sent = sample_sequence(model, tok, vocab, sent, text_size,
                               temperature, top_p, top_k)
        # "<unused0>" stands for a line break in the generated text.
        sent = sent.replace("<unused0>", "\n")
        sent = auto_enter(sent)
        sent_dict[num] = sent

        existing = [int(n) for n in os.listdir(samples)]
        now = max(existing, default=0)
        sample_path = os.path.join(samples, str(now + 1))
        with open(sample_path, 'w', encoding="utf-8") as f:
            f.write(sent)

        if num:
            num += 1
            if num >= loops:
                print("good")
                return sent_dict

    return sent_dict
def main(args):
    """Convert a tab-separated corpus into a chunked, gzip-compressed shelve DB.

    Rows of ``args.corpus`` are turned into features; features are stored
    as JSON in chunks of ``args.chunk_size`` under the keys ``chunk_<i>``.
    A ``meta.json`` with the settings is written next to the database.
    Rows that raise during preprocessing are reported and skipped.

    Args:
        args: Namespace providing ``corpus``, ``max_seq_len``,
            ``chunk_size``, ``reverse`` and ``two_turn``.

    Raises:
        ValueError: If the target database directory already exists.
    """
    toker = SentencepieceTokenizer(get_tokenizer())
    _, vocab = get_pytorch_kogpt2_model()

    # Database path: <corpus minus 4 chars>.<len>len[.reverse][.2turn].db/db
    name_parts = [args.corpus[:-4], f'{args.max_seq_len}len']
    if args.reverse:
        name_parts.append('reverse')
    if args.two_turn:
        name_parts.append('2turn')
    name_parts.append('db/db')
    db_path = '.'.join(name_parts)

    db_dir = dirname(db_path)
    if exists(db_dir):
        raise ValueError('Found existing DB, please backup')
    os.makedirs(db_dir)

    def _pack(examples):
        # JSON-encode and gzip one chunk of examples.
        return gzip.compress(json.dumps(examples).encode('utf-8'))

    with shelve.open(db_path, 'n') as db:
        reader = pd.read_csv(args.corpus, sep='\t', header=None)
        chunk = []
        n_chunk = 0
        n_example = 0
        for _, line in tqdm(reader.iterrows(), total=len(reader.index)):
            try:
                if len(chunk) >= args.chunk_size:
                    # Flush one full chunk and keep the remainder.
                    db[f'chunk_{n_chunk}'] = _pack(chunk[:args.chunk_size])
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker, vocab)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue

                features = _make_features(n_example, weights, inputs, toker,
                                          vocab, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # Whatever is left becomes the last chunk.
        db[f'chunk_{n_chunk}'] = _pack(chunk)

    # Save the settings needed to reproduce this run.
    meta = {
        'n_example': n_example,
        'chunk_size': args.chunk_size,
        'max_seq_len': args.max_seq_len,
        'reverse': args.reverse,
        'two_turn': args.two_turn
    }
    with open(join(db_dir, 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cuda',
         cachedir='~/kogpt2/',
         samples="./gdrive/My Drive/KoGPT2-FineTuning_pre/samples/"):
    """Load a KoGPT-2 checkpoint and generate text for every stdin line.

    After loading, the function loops forever: it reads one line from
    stdin and hands it to ``make_sentence``.

    Change over the previous version: the bare ``except:`` around
    ``load_path.split("/")[-2]`` is replaced by an explicit length check,
    so unrelated errors (including Ctrl-C) are no longer swallowed.

    Args:
        temperature, top_p, top_k, text_size, loops: Forwarded to
            ``make_sentence``.
        tmp_sent: Unused here; kept for interface compatibility.
        load_path: Checkpoint containing ``model_state_dict``.
        ctx: Device string used for ``map_location``.
        cachedir: Download cache directory.
        samples: Unused here; kept for interface compatibility.
    """
    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # The base model download is kept; its path is not needed afterwards.
    model_info = pytorch_kogpt2
    download(model_info['url'], model_info['fname'], model_info['chksum'],
             cachedir=cachedir)
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    device = torch.device(ctx)
    # Load the saved checkpoint.
    checkpoint = torch.load(load_path, map_location=device)

    # Build the language model and restore the fine-tuned weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])
    kogpt2model.eval()

    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    # Reduce the path to its parent directory name when it has one.
    path_parts = load_path.split("/")
    if len(path_parts) < 2:
        print("path error")
    else:
        load_path = path_parts[-2]
        print("ok : ", load_path)

    while True:
        sent = input()
        make_sentence(model, tok, vocab, sent, text_size, temperature, top_p,
                      top_k, loops)
def train():
    """Fine-tune the MXNet KoGPT2 chat model on the chatbot CSV.

    Uses the 'bertadam' optimizer with a linear warm-up followed by a
    linear decay of the learning rate, optional gradient accumulation and
    global-norm gradient clipping. Parameters are saved to
    ``opt.model_params`` after every epoch.
    """
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)

    data = pd.read_csv('Chatbot_data/ChatbotData.csv')
    max_len = opt.max_seq_len
    train_set = chat_data(data, tok_path, vocab, max_len=max_len)
    batch_size = opt.batch_size
    train_dataloader = mx.gluon.data.DataLoader(train_set,
                                                batch_size=batch_size,
                                                num_workers=5,
                                                shuffle=True)

    kogptqa = KoGPT2Chat(model)
    kogptqa.hybridize()

    # Softmax cross-entropy loss for classification.
    loss_function = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_function.hybridize()

    num_epochs = opt.num_epoch
    lr = 5e-5
    optimizer_args = {'learning_rate': lr, 'epsilon': 1e-8, 'wd': 0.01}
    trainer = gluon.Trainer(kogptqa.collect_params(), 'bertadam',
                            optimizer_args)

    # No weight decay on LayerNorm parameters and biases.
    for _, v in kogptqa.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0

    params = [
        p for p in kogptqa.collect_params().values() if p.grad_req != 'null'
    ]

    # Learning-rate schedule bookkeeping.
    accumulate = opt.accumulate
    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_examples = len(train_set)
    num_train_steps = int(num_train_examples / step_size * num_epochs)
    warmup_ratio = 0.1
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0
    all_model_params = kogptqa.collect_params()

    log_interval = 50
    neg = -1e18

    def _lr_at(step):
        # Linear warm-up, then linear decay towards zero.
        if step < num_warmup_steps:
            return lr * step / num_warmup_steps
        non_warmup_steps = step - num_warmup_steps
        offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
        return lr - offset * lr

    # Gradients must add up across batches when accumulating.
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    for epoch_id in range(num_epochs):
        step_loss = 0
        for batch_id, (token_ids, mask, label) in enumerate(train_dataloader):
            trainer.set_learning_rate(_lr_at(step_num))

            with mx.autograd.record():
                # Move the batch to the training context.
                token_ids = token_ids.as_in_context(ctx)
                mask = mask.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # Forward computation.
                out = kogptqa(token_ids)
                full_mask = mask.expand_dims(axis=2).repeat(
                    repeats=out.shape[2], axis=2)
                masked_out = nd.where(full_mask, out,
                                      neg * nd.ones_like(out))
                # Loss averaged over the unmasked positions only.
                ls = loss_function(masked_out, label).sum() / mask.sum()

            # Backward computation.
            ls.backward()

            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # Reset gradients for the next accumulation window.
                    all_model_params.zero_grad()

            step_loss += ls.asscalar()
            if step_num % log_interval == 0 and step_num > 0:
                mean_loss = step_loss / log_interval
                print(
                    '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, train ppl={:.3f}'
                    .format(epoch_id + 1, batch_id + 1, len(train_dataloader),
                            mean_loss, trainer.learning_rate,
                            math.exp(mean_loss)))
                step_loss = 0

        logging.info('saving model file to {}'.format(opt.model_params))
        kogptqa.save_parameters(opt.model_params)
def main(epoch, save_path, load_path, samples, data_file_path, batch_size):
    """Fine-tune KoGPT-2 on a text dataset with periodic sampling/checkpoints.

    Fixes over the previous version:
      * the running loss average is kept as a Python float
        (``loss.item()``) instead of accumulating the loss tensor, which
        kept autograd history alive across iterations,
      * the step-count regex is a raw string (``"\\d+"`` is an invalid
        escape sequence in a normal string),
      * bare ``except:`` clauses are narrowed and report what happened
        instead of silently discarding the error,
      * sample files are written with ``with`` and ``os.path.join``,
      * the loop variable no longer shadows the ``epoch`` parameter and
        the unused ``criterion`` is removed.

    Args:
        epoch: Number of epochs to train.
        save_path: Prefix (directory with trailing separator) for
            checkpoint files.
        load_path: Checkpoint to resume from; when it cannot be loaded,
            training starts from the downloaded base model at step 0.
        samples: Directory holding numerically named sample files.
        data_file_path: Training text handed to ``Read_Dataset``.
        batch_size: Batch size of the data loader.
    """
    ctx = 'cuda'
    cachedir = '~/kogpt2/'
    summary = SummaryWriter()

    # Download the base model.
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'], model_info['fname'],
                          model_info['chksum'], cachedir=cachedir)
    # Download the vocabulary.
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'], vocab_info['fname'],
                          vocab_info['chksum'], cachedir=cachedir)

    # Build the language model and load the downloaded base weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(model_path))

    device = torch.device(ctx)
    kogpt2model.to(device)

    # Resume from a checkpoint when one can be loaded.
    try:
        checkpoint = torch.load(load_path, map_location=device)
        kogpt2model = GPT2LMHeadModel(
            config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])
        kogpt2model.eval()
    except Exception as err:  # best effort: fall back to the base model
        print('checkpoint not loaded ({}); starting from step 0'.format(err))
        count = 0
    else:
        # The second number in the path is taken as the step count.
        count = int(re.findall(r"\d+", load_path)[1])
        print(count)

    # Switch to train mode for further fine-tuning.
    kogpt2model.train()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    dataset = Read_Dataset(data_file_path, vocab, tok)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                             pin_memory=True)

    learning_rate = 3e-5
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('KoGPT-2 Transfer Learning Start')
    # (decayed loss sum, decayed weight sum) for a running average.
    avg_loss = (0.0, 0.0)

    for epoch_idx in range(epoch):
        for data in data_loader:
            optimizer.zero_grad()
            # The loader yields a list of tensors; stack and transpose it
            # before feeding the model.
            data = torch.stack(data)
            data = data.transpose(1, 0)
            data = data.to(ctx)
            # The model may have been moved to CPU for sampling.
            model = model.to(ctx)

            outputs = model(data, labels=data)
            loss, logits = outputs[:2]
            loss = loss.to(ctx)
            loss.backward()
            # Plain float: do not keep the graph alive in the statistics.
            loss_value = loss.item()
            avg_loss = (avg_loss[0] * 0.99 + loss_value,
                        avg_loss[1] * 0.99 + 1.0)
            optimizer.step()

            if count % 10 == 0:
                running = avg_loss[0] / avg_loss[1]
                print(
                    'epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'
                    .format(epoch_idx, count, loss_value, running))
                summary.add_scalar('loss/avg_loss', running, count)
                summary.add_scalar('loss/loss', loss_value, count)

            last_batch = len(data) < batch_size

            # Periodic text sample.
            if (count > 0 and count % 1000 == 0) or last_batch:
                sent = sample_sequence(model.to("cpu"), tok, vocab,
                                       sent="사랑", text_size=100,
                                       temperature=0.7, top_p=0.8, top_k=40)
                sent = sent.replace("<unused0>", "\n")
                print(sent)
                summary.add_text('Text', sent, count)

                if count > 500000:
                    now = max(int(n) for n in os.listdir(samples))
                    sample_path = os.path.join(samples, str(now + 1))
                    with open(sample_path, 'w', encoding="utf-8") as f:
                        f.write(sent)

            count += 1

            # Periodic checkpoint.
            if (count > 0 and count % 10000 == 0) or last_batch:
                try:
                    torch.save(
                        {
                            'epoch': epoch_idx,
                            'train_no': count,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss
                        }, save_path + 'KoGPT2_checkpoint_' + str(count) +
                        '.tar')
                except (OSError, RuntimeError) as err:
                    # Keep training even if this checkpoint is lost.
                    print('checkpoint save failed: {}'.format(err))
def main(epoch=200,
         save_path='./checkpoint/',
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         data_file_path='dataset/lyrics_dataset.txt',
         batch_size=8,
         summary_url='runs/',
         new=0,
         text_size=100):
    """Fine-tune KoGPT-2 on a text dataset, logging samples and checkpoints.

    Downloads the pretrained KoGPT-2 weights and the tokenizer vocabulary,
    tries to resume from ``load_path``, then trains with Adam (lr=3e-5).
    Loss values are written to a ``SummaryWriter`` every 10 steps, a text
    sample is generated on logging steps that are a multiple of 1000 (or
    that fall on a short final batch), and a checkpoint is written every
    18500 steps.

    Args:
        epoch: Number of passes over the data loader.
        save_path: Prefix prepended to ``KoGPT2_checkpoint_<step>.tar``.
        load_path: Checkpoint to resume from. When it cannot be loaded,
            training continues from the pretrained weights at step 0.
        data_file_path: Path handed to ``Read_Dataset``.
        batch_size: Batch size for the ``DataLoader``.
        summary_url: Log directory handed to ``SummaryWriter``.
        new: When truthy, the step counter restarts at 0 even if a
            checkpoint was loaded.
        text_size: ``text_size`` argument forwarded to ``sample_sequence``.
    """
    ctx = 'cuda'
    cachedir = '~/kogpt2/'
    summary = SummaryWriter(summary_url)

    pytorch_kogpt2 = {
        'url': 'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # Download the pretrained model weights.
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)
    # Download the tokenizer vocabulary.
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    # Build the GPT2LMHeadModel used for KoGPT-2 language-model training
    # and fill it with the downloaded pretrained weights.
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(model_path))
    device = torch.device(ctx)
    kogpt2model.to(device)

    # Resume from a checkpoint when possible. The checkpoint is loaded into
    # a separate model object so that a failed load leaves the pretrained
    # weights above untouched instead of a randomly initialised network.
    count = 0
    try:
        checkpoint = torch.load(load_path, map_location=device)
        resumed = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
        resumed.load_state_dict(checkpoint['model_state_dict'])
    except Exception as err:
        # Best effort: a missing or unreadable checkpoint is not fatal.
        print("count 0 : ", load_path)
        print("checkpoint load failed : ", repr(err))
    else:
        kogpt2model = resumed
        numbers = re.findall(r"\d+", load_path)
        print("count check : ", numbers)
        # NOTE(review): every digit run in the path takes part, including
        # the "2" of "KoGPT2" -- confirm this is the intended step source.
        count = max((int(i) for i in numbers), default=0)

    if new:
        count = 0

    # Switch to training mode for further fine-tuning.
    kogpt2model.train()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    # One tokenizer instance serves both the dataset and sample generation.
    tok = SentencepieceTokenizer(tok_path)

    dataset = Read_Dataset(data_file_path, vocab, tok)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             pin_memory=True)

    learning_rate = 3e-5
    model = model.to(ctx)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('KoGPT-2 Transfer Learning Start')
    # (decayed loss sum, decayed step count) -> exponential moving average.
    avg_loss = (0.0, 0.0)
    for epoch_no in range(epoch):
        for data in data_loader:
            optimizer.zero_grad()
            # The loader yields a list of tensors, so stack them into one.
            data = torch.stack(data)
            data = data.transpose(1, 0)
            data = data.to(ctx)
            # Sampling below moves the model to the CPU; move it back.
            model = model.to(ctx)

            outputs = model(data, labels=data)
            loss = outputs[0]
            loss.backward()
            # Keep a plain float so the running average does not hold on
            # to the autograd history of every past step.
            loss_value = loss.item()
            avg_loss = (avg_loss[0] * 0.99 + loss_value,
                        avg_loss[1] * 0.99 + 1.0)
            optimizer.step()

            if count % 10 == 0:
                running = avg_loss[0] / avg_loss[1]
                print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}' .
                      format(epoch_no, count, loss_value, running))
                summary.add_scalar('loss/avg_loss', running, count)
                summary.add_scalar('loss/loss', loss_value, count)

                # Generate a sample text.
                if (count > 0 and count % 1000 == 0) or (len(data) < batch_size):
                    sent = sample_sequence(model.to("cpu"),
                                           tok,
                                           vocab,
                                           sent="성실",
                                           text_size=text_size,
                                           temperature=0.7,
                                           top_p=0.8,
                                           top_k=40)
                    # Inefficient, but needed to restore line breaks.
                    sent = sent.replace("<unused0>", "\n")
                    sent = auto_enter(sent)
                    print(sent)
                    summary.add_text('Text', sent, count)
                    del sent

            if count > 0 and count % 18500 == 0:
                # Save the model; a failed save is reported, not fatal.
                try:
                    torch.save({
                        'epoch': epoch_no,
                        'train_no': count,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss
                    }, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')
                except Exception as err:
                    print('checkpoint not saved at step {}: {!r}'.format(
                        count, err))

            count += 1
def __init__(self):
    """Resolve the tokenizer model path and build the tokenizer from it."""
    tokenizer_path = get_tokenizer()
    self.tok_path = tokenizer_path
    # num_best=0 and alpha=0 are forwarded unchanged to the tokenizer.
    self.tok = SentencepieceTokenizer(tokenizer_path,
                                      num_best=0,
                                      alpha=0)
def __init__(self, vocab, MAX_LEN=1024):
    """Read every ``*.txt`` file under ``./TK_data/TT_data`` into token lists.

    Each file becomes one entry in ``CONTEXT_IN``, ``MASK_IN`` and
    ``LABELS_IN``. Lines are tokenized one by one and alternate by
    position, counting from 0:

    * even lines are wrapped as ``[U_TKN] + tokens + [EOS]``, get mask 0
      and ``MASK`` as label for every position;
    * odd lines are wrapped as ``[S_TKN] + tokens + [EOS]``, get mask 1
      and the wrapped tokens themselves as labels.

    Only the mask list is padded (with 0) up to ``MAX_LEN`` here.

    Args:
        vocab: Vocabulary object; its padding token id is used as the
            pad value of ``self.padder``.
        MAX_LEN: Maximum number of tokens allowed per file.

    Raises:
        ValueError: If one file yields more than ``MAX_LEN`` tokens.
    """
    self.q_token = U_TKN  # BOS of Q
    self.a_token = S_TKN  # BOS of A
    self.bos = BOS
    self.eos = EOS
    self.maskt = MASK
    self.sent_token = SENT

    self.folder_path = "./TK_data/TT_data"
    self.CONTEXT_IN = []
    self.MASK_IN = []
    self.LABELS_IN = []
    self.MAX_LEN = MAX_LEN

    self._tok_path = get_tokenizer()
    self.tokenizer = None
    # presumably _activate_sp() assigns self.tokenizer -- it is called
    # below as a callable; verify against its definition.
    self._activate_sp()

    self.first = True
    self.vocab = vocab
    self.padder = nlp.data.PadSequence(
        MAX_LEN, pad_val=self.vocab[self.vocab.padding_token])

    for file_path in glob.glob(self.folder_path + "/*.txt"):
        context_tokens = []
        mask_flags = []
        label_tokens = []

        # The context manager closes the file even when the length check
        # below raises.
        with open(file_path, 'r', encoding='utf-8') as dialogue_file:
            for line_no, raw_line in enumerate(dialogue_file):
                print("\n\n\nTK: {}\n\n\n".format(raw_line))
                # Strip only the line terminator, so a final line without
                # a trailing newline keeps its last character.
                toked = self.tokenizer(raw_line.rstrip('\n'))
                if line_no % 2 == 0:
                    turn = [self.q_token] + toked + [self.eos]
                    context_tokens += turn
                    mask_flags += [0] * len(turn)
                    label_tokens += [self.maskt] * len(turn)
                else:
                    turn = [self.a_token] + toked + [self.eos]
                    context_tokens += turn
                    mask_flags += [1] * len(turn)
                    label_tokens += turn

        context_len = len(context_tokens)
        if context_len > self.MAX_LEN:
            raise ValueError(
                'None expected CONTEXT_LEN : {}'.format(context_len))

        mask_flags += [0] * (self.MAX_LEN - context_len)

        self.CONTEXT_IN.append(context_tokens)
        # [0, 0, 0, 0, ....., 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, .... ]
        self.MASK_IN.append(mask_flags)
        # [mask, mask, ...., mask, ..., <bos>,..A.. <eos>, <pad>....]
        self.LABELS_IN.append(label_tokens)