def __init__( self, file_path="/content/drive/My Drive/RogerHeederer/ChatBot/KoGPT2_Wellness/data/total.txt", n_ctx=1024): self.file_path = file_path self.data = [] self.tokenizer = get_kogpt2_tokenizer() bos_token_id = [self.tokenizer.bos_token_id] #<s> eos_token_id = [self.tokenizer.eos_token_id] #</s> pad_token_id = [self.tokenizer.pad_token_id] #<pad> file = open(self.file_path, 'r', encoding='utf-8') while True: line = file.readline() if not line: break datas = line.split(" ") # 질문과 답변을 " " 단위로 나눈다. #index_of_words = <s>질문</s><pad> + <s>답변</s><pad> index_of_words = bos_token_id + self.tokenizer.encode( datas[0] ) + eos_token_id + bos_token_id + self.tokenizer.encode( datas[1][:-1]) + eos_token_id pad_token_len = n_ctx - len(index_of_words) #문장 max 길이에서 현재 길이값 빼기 index_of_words += pad_token_id * pad_token_len self.data.append(index_of_words) # 남은 자리에 패딩처리 file.close()
def __init__(self, file_path = "../data/wellness_dialog_for_autoregressive.txt", n_ctx = 1024 ): self.file_path = file_path self.data =[] self.tokenizer = get_kogpt2_tokenizer() bos_token_id = [self.tokenizer.bos_token_id] eos_token_id = [self.tokenizer.eos_token_id] pad_token_id = [self.tokenizer.pad_token_id] file = open(self.file_path, 'r', encoding='utf-8') while True: line = file.readline() if not line: break datas = line.split(" ") index_of_words = bos_token_id +self.tokenizer.encode(datas[0]) + eos_token_id + bos_token_id + self.tokenizer.encode(datas[1][:-1])+ eos_token_id pad_token_len = n_ctx - len(index_of_words) index_of_words += pad_token_id * pad_token_len self.data.append(index_of_words) file.close()
class Chat(chat_pb2_grpc.ChatServicer):
    # Load the .pth file (PyTorch weights) saved under WEB_ASK_06DEVBROS/ai/chatbot/checkpoint
    root_path = str(pathlib.Path(__file__).parent.absolute()) + '/../ai/chatbot'
    checkpoint_path = f"{root_path}/checkpoint"
    save_ckpt_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"

    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(ctx)

    # Load the saved checkpoint
    checkpoint = torch.load(save_ckpt_path, map_location=device)
    model = DialogKoGPT2()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    tokenizer = get_kogpt2_tokenizer()

    def ChatBot(self, request, context):
        reqChat = request.clientChat  # incoming chat message
        tokenized_indexs = self.tokenizer.encode(reqChat)
        input_ids = torch.tensor([self.tokenizer.bos_token_id]
                                 + tokenized_indexs
                                 + [self.tokenizer.eos_token_id]).unsqueeze(0)
        output = self.model.generate(input_ids=input_ids)
        resChat = self.tokenizer.decode(
            output[0].tolist()[len(tokenized_indexs) + 1:],
            skip_special_tokens=True)
        return chat_pb2.ChatReply(serverChat=resChat)
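The servicer needs a running gRPC server to be reachable. A minimal sketch of wiring `Chat` into grpcio, assuming the generated `chat_pb2_grpc` module above; the port and worker count are assumptions:

from concurrent import futures
import grpc

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    chat_pb2_grpc.add_ChatServicer_to_server(Chat(), server)
    server.add_insecure_port('[::]:50051')  # assumed port
    server.start()
    server.wait_for_termination()

if __name__ == '__main__':
    serve()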
def tweeter_autoregressive_data():
    root_path = "../data"
    tokenizer = get_kogpt2_tokenizer()
    # wellness_autoregressive_file = root_path + "/wellness_dialog_for_autoregressive.txt"
    # wellness_text_classification_file = root_path + "/wellness_dialog_for_text_classification.txt"
    file_path = root_path + "/tweeter_dialog_data.txt"
    tweeter_autoregressive_file = root_path + "/tweeter_dialog_for_autoregressive.txt"

    data_file = open(file_path, 'r')
    tweet_file = open(tweeter_autoregressive_file, 'w')
    data_file_lines = data_file.readlines()

    dialog = ''
    max_len = 0
    for line_num, line_data in enumerate(data_file_lines):
        if line_data == "\n" and dialog != '':
            # a blank line ends the current dialog; flush it to the output file
            dialog += "\n"
            tweet_file.write(dialog)
            print(dialog)
            dialog = ''
        elif line_data != "\n":
            tmp_data = dialog + "<s>" + line_data[:-1] + "</s>"
            if len(tokenizer.encode(tmp_data)) >= 1024:
                continue  # skip utterances that would push the dialog past the context window
            else:
                max_len = max(len(tokenizer.encode(tmp_data)), max_len)
                dialog = tmp_data
    print('max_token_length: ', max_len)
    data_file.close()
    tweet_file.close()
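To make the transformation concrete: assuming the input file holds one utterance per line with a blank line between dialogs, each dialog is collapsed into one `<s>...</s>`-delimited output line (the utterance text below is placeholder):

# tweeter_dialog_data.txt (assumed input layout):
#   first utterance
#   reply utterance
#   <blank line>
#
# tweeter_dialog_for_autoregressive.txt (resulting line):
#   <s>first utterance</s><s>reply utterance</s>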
def token_num(data_path='./data/train.jsonl'):
    data = []
    with open(data_path, 'r') as json_file:
        json_list = list(json_file)

    bert_tok = get_tokenizer()
    gpt_tok = get_kogpt2_tokenizer()
    bert_tok_num = 0
    gpt_tok_num = 0
    count = 0
    for json_str in json_list:
        json_data = json.loads(json_str)
        tmp_str = json_data['abstractive']
        # for arti_str in json_data['article_original']:
        #     tmp_str += arti_str
        bert_tok_num = max(bert_tok_num,
                           len(bert_tok.encode(tmp_str, max_length=512, truncation=True)))
        gpt_tok_num = max(gpt_tok_num,
                          len(gpt_tok.encode(tmp_str, max_length=512, truncation=True)))
        # print(len(json_data['article_original']))
        # sum_len += len(json_data['article_original'])
        # count += 1
    # print('average article_original len - ', sum_len/count)
    print('max bert token len:', bert_tok_num)
    print('max gpt token len:', gpt_tok_num)
def __init__(self, MAX_LEN=2048):
    self.folder_path = "./TK_data/T0_data"
    self.DATA_PATH = []
    self.DATA_PATH_IDX = []
    self.DATA_PATH_LEN = []
    self.previous_context = None
    self.MAX_LEN = MAX_LEN
    self.tokenizer = get_kogpt2_tokenizer()
    self.bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    self.eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    self.pad_token_id = [self.tokenizer.pad_token_id]  # padding token

    TEMP_MAX = 0
    INDEX = 0
    for file_path in glob.glob(self.folder_path + "/*.txt"):
        self.DATA_PATH.append(file_path)
        file = open(file_path, 'r', encoding='utf-8')
        data = file.readline()
        DATA_LEN = 1
        # count the remaining lines of the file
        while True:
            data = file.readline()
            if not data:  # EOF check; without it this loop never terminates
                break
            DATA_LEN += 1
        file.close()
        self.DATA_PATH_IDX.append(INDEX)
        self.DATA_PATH_LEN.append(DATA_LEN)
        INDEX += 1
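This constructor only records file paths and per-file line counts, so the actual reading and tokenizing presumably happens lazily per item. A sketch of how `__len__`/`__getitem__` might map a global line index back onto a (file, line) pair using the recorded lengths; these methods are an assumption, not part of the original:

def __len__(self):
    return sum(self.DATA_PATH_LEN)

def __getitem__(self, index):
    # walk the per-file line counts to find the file this index falls into
    for file_idx, length in zip(self.DATA_PATH_IDX, self.DATA_PATH_LEN):
        if index < length:
            with open(self.DATA_PATH[file_idx], 'r', encoding='utf-8') as f:
                line = f.readlines()[index]
            break
        index -= length
    tokens = self.bos_token_id + self.tokenizer.encode(line.strip()) + self.eos_token_id
    tokens += self.pad_token_id * (self.MAX_LEN - len(tokens))
    return torch.tensor(tokens)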
def __init__(self):
    # Load Reranker model & tokenizer
    print("Load Reranker model & tokenizer")
    self.reranker_model = BertForSequenceClassification.from_pretrained(
        "/models/reranker/checkpoint-920", num_labels=2)
    self.reranker_tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
    self.reranker_tokenizer.add_special_tokens({"additional_special_tokens": ["[/]"]})
    self.reranker_model.resize_token_embeddings(len(self.reranker_tokenizer))
    self.reranker_model = self.reranker_model.to("cuda")
    self.reranker_model.eval()

    # Load Classifier model & tokenizer
    print("Load Classifier model & tokenizer")
    self.classifier_model = BertForSequenceClassification.from_pretrained(
        "/models/classifier/checkpoint-190", num_labels=167)
    self.classifier_tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
    self.classifier_model = self.classifier_model.to("cuda")
    self.classifier_model.eval()

    # Load Generator model & tokenizer
    print("Load Generator model & tokenizer")
    self.generator_model = GPT2LMHeadModel.from_pretrained("/models/generator/checkpoint-851")
    self.generator_tokenizer = get_kogpt2_tokenizer()
    self.generator_tokenizer.add_special_tokens({"additional_special_tokens": ["<chatbot>"]})
    self.generator_model.resize_token_embeddings(len(self.generator_tokenizer))
    self.generator_model = self.generator_model.to("cuda")
    self.generator_model.eval()

    self.history = []
    self.candidates = []
    with open("/models/label_dic", 'rb') as f:
        self.temp_dic = pickle.load(f)
    self.labels = sorted(self.temp_dic.keys())
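A hedged sketch of how the generator stage might be invoked, given that `<chatbot>` was registered as a special token above. The prompt format, method name, and decoding parameters are all assumptions; `generate` and `encode(..., return_tensors="pt")` are standard Hugging Face calls:

def generate_reply(self, user_input):
    # assumed prompt format: user text, then the <chatbot> marker for the bot's turn
    prompt = user_input + "<chatbot>"
    input_ids = self.generator_tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = self.generator_model.generate(
            input_ids,
            max_length=128,           # assumed decoding parameters
            do_sample=True,
            top_k=50,
            pad_token_id=self.generator_tokenizer.pad_token_id)
    # decode only the newly generated tokens after the prompt
    return self.generator_tokenizer.decode(
        output[0][input_ids.shape[-1]:], skip_special_tokens=True)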
def __init__(self, root_path='../ai/chatbot'):
    checkpoint_path = f"{root_path}/checkpoint"
    self.model_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"

    checkpoint = torch.load(self.model_path, map_location=device)
    self.model = DialogKoGPT2()
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.eval()

    self.tokenizer = get_kogpt2_tokenizer()
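A minimal sketch of an inference method to pair with this constructor, mirroring the generate/decode pattern used in the `ChatBot` servicer above; the method name `answer` is an assumption:

def answer(self, question):
    tokenized_indexs = self.tokenizer.encode(question)
    input_ids = torch.tensor([self.tokenizer.bos_token_id]
                             + tokenized_indexs
                             + [self.tokenizer.eos_token_id]).unsqueeze(0)
    output = self.model.generate(input_ids=input_ids)
    # drop the prompt tokens (<s> + question) and decode only the reply
    return self.tokenizer.decode(output[0].tolist()[len(tokenized_indexs) + 1:],
                                 skip_special_tokens=True)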
def __init__(self, n_ctx=1024):
    self.file_path = "./TK_data/T1_wellness/T1_wellness_train.txt"
    self.DATA = []
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    pad_token_id = [self.tokenizer.pad_token_id]  # padding token

    file = open(self.file_path, 'r', encoding='utf-8')
    while True:
        line = file.readline()
        if not line:
            break
        datas = line.split(" ")

        q = datas[0]
        q_toked = self.tokenizer.encode(q)

        # sentiment = analyser.polarity_scores(text)
        sentiment = vader_polarity(q)
        if sentiment == 1:
            sentiment = 'g'  # good
        else:
            sentiment = 'b'  # bad
        sent_toked = self.tokenizer.encode(sentiment)

        a = datas[1]
        a_toked = self.tokenizer.encode(a[:-1])

        # Q token: <BOS> question <EOS> <BOS> sentiment <EOS>
        q_toked = bos_token_id + q_toked + eos_token_id + \
                  bos_token_id + sent_toked + eos_token_id
        q_len = len(q_toked)

        # A token: <BOS> answer <EOS>
        # a_toked = bos_token_id + sent_toked + eos_token_id + \
        a_toked = bos_token_id + a_toked + eos_token_id
        a_len = len(a_toked)

        # check padding length
        pad_token_len = n_ctx - q_len - a_len

        # padding
        index_of_words = q_toked + a_toked + pad_token_id * pad_token_len
        self.DATA.append(index_of_words)
    file.close()
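`vader_polarity` is not defined in this snippet. A plausible sketch consistent with the commented-out `analyser.polarity_scores` call, using the vaderSentiment package; the 0/1 threshold is an assumption. Note that VADER's lexicon is English, so its usefulness on Korean questions is limited:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def vader_polarity(text):
    # returns 1 for non-negative sentiment, 0 otherwise (threshold is an assumption)
    score = analyser.polarity_scores(text)['compound']
    return 1 if score >= 0 else 0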
def __init__(self, MAX_LEN=1024):
    self.file_path = "./TK_data/T0_data/T0_data.txt"
    self.DATA = []
    self.MAX_LEN = MAX_LEN
    self.signal = 1
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    pad_token_id = [self.tokenizer.pad_token_id]  # padding token
    # ==========================================================
    file = open(self.file_path, 'r', encoding='utf-8')
    TK_MAX_SIZE = 0
    while True:
        line = file.readline()
        if not line:
            break
        if line == "<CONTEXT_END>\n":
            self.signal = 1
            continue
        datas = line.split(" ")
        q_toked = self.tokenizer.encode(datas[0])
        a_toked = self.tokenizer.encode(datas[1][:-1])

        # Q token
        q_toked = bos_token_id + q_toked + eos_token_id
        q_len = len(q_toked)

        # A token
        a_toked = bos_token_id + a_toked + eos_token_id
        a_len = len(a_toked)

        # check padding length; skip samples that exceed the context window
        pad_token_len = MAX_LEN - q_len - a_len
        if pad_token_len < 0:
            continue
        if TK_MAX_SIZE < q_len + a_len:
            TK_MAX_SIZE = q_len + a_len

        # padding
        index_of_words = q_toked + a_toked + pad_token_id * pad_token_len
        self.DATA.append(index_of_words)
    file.close()
    print("\n\n\n MAXSIZE : {}".format(TK_MAX_SIZE))
def chatbot_qa(Question):
    # root_path = 'drive/My Drive/Colab Notebooks/dialogLM'
    root_path = str(pathlib.Path(__file__).parent.absolute())
    data_path = f"{root_path}\data\wellness_dialog_for_autoregressive_train.txt"
    checkpoint_path = f"{root_path}\checkpoint"
    # save_ckpt_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"
    # save_ckpt_path = f"D:\KNHANES_7\WEB_Ask_06devbros\ai\chatbot\checkpoint\kogpt2-wellness-auto-regressive.pth"

    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(ctx)

    # Load the saved checkpoint
    # checkpoint = torch.load(save_ckpt_path, map_location=device)
    checkpoint = torch.load("../checkpoint/kogpt2-wellness-auto-regressive.pth",
                            map_location=device)
    model = DialogKoGPT2()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    tokenizer = get_kogpt2_tokenizer()
    count = 0
    output_size = 200  # number of tokens to generate

    sent = Question
    tokenized_indexs = tokenizer.encode(sent)
    input_ids = torch.tensor([tokenizer.bos_token_id]
                             + tokenized_indexs
                             + [tokenizer.eos_token_id]).unsqueeze(0)
    # set top_k to 50
    sample_output = model.generate(input_ids=input_ids)

    chatbot_answer = tokenizer.decode(
        sample_output[0].tolist()[len(tokenized_indexs) + 1:],
        skip_special_tokens=True)
    print("Answer: " + chatbot_answer)
    print(100 * '-')
    return chatbot_answer
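The "set top_k to 50" comment and the `output_size` variable go unused in the call above. If `DialogKoGPT2.generate` forwards keyword arguments to Hugging Face's `generate` (an assumption about this wrapper), the intended sampling call would look like:

sample_output = model.generate(input_ids=input_ids,
                               max_length=output_size,
                               do_sample=True,
                               top_k=50)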
def __init__(self,
             device,
             n_ctx=1024,
             articles_max_length=810,
             summary_max_length=210):
    self.data = []
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # <s>
    eos_token_id = [self.tokenizer.eos_token_id]  # </s>
    pad_token_id = [self.tokenizer.pad_token_id]  # <pad>

    jsonl_datas = jsonl_load()
    # for dict_data in jsonl_datas:
    for dict_data in tqdm(jsonl_datas):
        articles = dict_data['article_original']
        abstractive_summary = dict_data['abstractive']

        tmp_str = ''
        for article in articles:
            tmp_str += article

        # encode; truncate if the string exceeds the max length
        enc_tmp_str = self.tokenizer.encode(tmp_str,
                                            truncation=True,
                                            max_length=articles_max_length)
        enc_abstractive_summary = self.tokenizer.encode(abstractive_summary,
                                                        truncation=True,
                                                        max_length=summary_max_length)

        # <s> text to summarize </s> summary </s>
        index_of_words = (bos_token_id + enc_tmp_str + eos_token_id
                          + enc_abstractive_summary + eos_token_id)
        pad_token_len = n_ctx - len(index_of_words)
        index_of_words += pad_token_id * pad_token_len
        self.data.append(torch.tensor(index_of_words).to(device))
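A minimal sketch of consuming this dataset for training, assuming the surrounding class implements `__len__`/`__getitem__` over `self.data`; the class name, batch size, and `model` are assumptions. In practice pad positions would be masked out of the loss, as the `ignore_index=3` criterion below does:

from torch.utils.data import DataLoader

dataset = KoGPT2SummarizationDataset(device)  # hypothetical class name
loader = DataLoader(dataset, batch_size=4, shuffle=True)
for batch in loader:                 # each batch: (batch_size, n_ctx) token ids
    outputs = model(batch, labels=batch)
    loss, logits = outputs[0], outputs[1]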
save_step = 100          # checkpoint save interval
learning_rate = 5e-5     # learning rate

# STEP 2-2. dataset & model
checkpoint = torch.load(save_ckpt_path, map_location=device)
model = DialogKoGPT2()
model.load_state_dict(checkpoint['model_state_dict'])
# model.to(device)
# model.eval()
model.train()

# STEP 2-3. training configuration
tokenizer = get_kogpt2_tokenizer()
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ========================= FOR CONVENIENCE =============================
bos_token_id = [tokenizer.bos_token_id]  # begin-of-string token <BOS>
eos_token_id = [tokenizer.eos_token_id]  # end-of-string token <EOS>
pad_token_id = [tokenizer.pad_token_id]  # padding token
# =======================================================================

# STEP 4. evaluation
while 1:  # for i in range(5):
    sent = input('Question: ')  # e.g. "I've been feeling depressed lately"
    tokenized_indexs = tokenizer.encode(sent)
    q_toked = bos_token_id + tokenized_indexs + eos_token_id
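The fragment cuts off after building `q_toked`. A hedged completion of the loop body, following the generate/decode pattern used in the other snippets here (a real evaluation pass would also switch the model back to `eval()` mode first):

    input_ids = torch.tensor(q_toked).unsqueeze(0)
    sample_output = model.generate(input_ids=input_ids)
    answer = tokenizer.decode(
        sample_output[0].tolist()[len(tokenized_indexs) + 1:],
        skip_special_tokens=True)
    print('Answer:', answer)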