def __init__( self, file_path="/content/drive/My Drive/RogerHeederer/ChatBot/KoGPT2_Wellness/data/total.txt", n_ctx=1024): self.file_path = file_path self.data = [] self.tokenizer = get_kogpt2_tokenizer() bos_token_id = [self.tokenizer.bos_token_id] #<s> eos_token_id = [self.tokenizer.eos_token_id] #</s> pad_token_id = [self.tokenizer.pad_token_id] #<pad> file = open(self.file_path, 'r', encoding='utf-8') while True: line = file.readline() if not line: break datas = line.split(" ") # 질문과 답변을 " " 단위로 나눈다. #index_of_words = <s>질문</s><pad> + <s>답변</s><pad> index_of_words = bos_token_id + self.tokenizer.encode( datas[0] ) + eos_token_id + bos_token_id + self.tokenizer.encode( datas[1][:-1]) + eos_token_id pad_token_len = n_ctx - len(index_of_words) #문장 max 길이에서 현재 길이값 빼기 index_of_words += pad_token_id * pad_token_len self.data.append(index_of_words) # 남은 자리에 패딩처리 file.close()
def __init__(self, file_path = "../data/wellness_dialog_for_autoregressive.txt", n_ctx = 1024 ): self.file_path = file_path self.data =[] self.tokenizer = get_kogpt2_tokenizer() bos_token_id = [self.tokenizer.bos_token_id] eos_token_id = [self.tokenizer.eos_token_id] pad_token_id = [self.tokenizer.pad_token_id] file = open(self.file_path, 'r', encoding='utf-8') while True: line = file.readline() if not line: break datas = line.split(" ") index_of_words = bos_token_id +self.tokenizer.encode(datas[0]) + eos_token_id + bos_token_id + self.tokenizer.encode(datas[1][:-1])+ eos_token_id pad_token_len = n_ctx - len(index_of_words) index_of_words += pad_token_id * pad_token_len self.data.append(index_of_words) file.close()
class Chat(chat_pb2_grpc.ChatServicer):
    # Load the .pth file (PyTorch weights) saved under WEB_ASK_06DEVBROS/ai/chatbot/checkpoint
    root_path = str(pathlib.Path(__file__).parent.absolute()) + '/../ai/chatbot'
    checkpoint_path = f"{root_path}/checkpoint"
    save_ckpt_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"

    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(ctx)

    # Load the saved checkpoint
    checkpoint = torch.load(save_ckpt_path, map_location=device)
    model = DialogKoGPT2()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    tokenizer = get_kogpt2_tokenizer()

    def ChatBot(self, request, context):
        reqChat = request.clientChat  # incoming chat message
        tokenized_indexs = self.tokenizer.encode(reqChat)
        input_ids = torch.tensor([self.tokenizer.bos_token_id]
                                 + tokenized_indexs
                                 + [self.tokenizer.eos_token_id]).unsqueeze(0)
        output = self.model.generate(input_ids=input_ids)
        resChat = self.tokenizer.decode(
            output[0].tolist()[len(tokenized_indexs) + 1:],
            skip_special_tokens=True)
        return chat_pb2.ChatReply(serverChat=resChat)
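The servicer needs a running gRPC server to be reachable. A minimal sketch of wiring `Chat` into grpcio, assuming the generated `chat_pb2_grpc` module above; the port and worker count are assumptions:

from concurrent import futures
import grpc

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    chat_pb2_grpc.add_ChatServicer_to_server(Chat(), server)
    server.add_insecure_port('[::]:50051')  # assumed port
    server.start()
    server.wait_for_termination()

if __name__ == '__main__':
    serve()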
def tweeter_autoregressive_data():
    root_path = "../data"
    tokenizer = get_kogpt2_tokenizer()
    # wellness_autoregressive_file = root_path + "/wellness_dialog_for_autoregressive.txt"
    # wellness_text_classification_file = root_path + "/wellness_dialog_for_text_classification.txt"
    file_path = root_path + "/tweeter_dialog_data.txt"
    tweeter_autoregressive_file = root_path + "/tweeter_dialog_for_autoregressive.txt"

    data_file = open(file_path, 'r')
    tweet_file = open(tweeter_autoregressive_file, 'w')
    data_file_lines = data_file.readlines()

    dialog = ''
    max_len = 0
    for line_num, line_data in enumerate(data_file_lines):
        if line_data == "\n" and dialog != '':
            # a blank line ends the current dialog; flush it to the output file
            dialog += "\n"
            tweet_file.write(dialog)
            print(dialog)
            dialog = ''
        elif line_data != "\n":
            tmp_data = dialog + "<s>" + line_data[:-1] + "</s>"
            if len(tokenizer.encode(tmp_data)) >= 1024:
                continue  # skip utterances that would push the dialog past the context window
            else:
                max_len = max(len(tokenizer.encode(tmp_data)), max_len)
                dialog = tmp_data
    print('max_token_length: ', max_len)
    data_file.close()
    tweet_file.close()
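To make the transformation concrete: assuming the input file holds one utterance per line with a blank line between dialogs, each dialog is collapsed into one `<s>...</s>`-delimited output line (the utterance text below is placeholder):

# tweeter_dialog_data.txt (assumed input layout):
#   first utterance
#   reply utterance
#   <blank line>
#
# tweeter_dialog_for_autoregressive.txt (resulting line):
#   <s>first utterance</s><s>reply utterance</s>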
def token_num(data_path='./data/train.jsonl'):
    data = []
    with open(data_path, 'r') as json_file:
        json_list = list(json_file)

    bert_tok = get_tokenizer()
    gpt_tok = get_kogpt2_tokenizer()
    bert_tok_num = 0
    gpt_tok_num = 0
    count = 0
    for json_str in json_list:
        json_data = json.loads(json_str)
        tmp_str = json_data['abstractive']
        # for arti_str in json_data['article_original']:
        #     tmp_str += arti_str
        bert_tok_num = max(bert_tok_num,
                           len(bert_tok.encode(tmp_str, max_length=512, truncation=True)))
        gpt_tok_num = max(gpt_tok_num,
                          len(gpt_tok.encode(tmp_str, max_length=512, truncation=True)))
        # print(len(json_data['article_original']))
        # sum_len += len(json_data['article_original'])
        # count += 1
    # print('average article_original len - ', sum_len/count)
    print('max bert token len:', bert_tok_num)
    print('max gpt token len:', gpt_tok_num)
def __init__(self, MAX_LEN=2048):
    self.folder_path = "./TK_data/T0_data"
    self.DATA_PATH = []
    self.DATA_PATH_IDX = []
    self.DATA_PATH_LEN = []
    self.previous_context = None
    self.MAX_LEN = MAX_LEN
    self.tokenizer = get_kogpt2_tokenizer()
    self.bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    self.eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    self.pad_token_id = [self.tokenizer.pad_token_id]  # padding token

    TEMP_MAX = 0
    INDEX = 0
    for file_path in glob.glob(self.folder_path + "/*.txt"):
        self.DATA_PATH.append(file_path)
        file = open(file_path, 'r', encoding='utf-8')
        data = file.readline()
        DATA_LEN = 1
        # count the remaining lines of the file
        while True:
            data = file.readline()
            if not data:  # EOF check; without it this loop never terminates
                break
            DATA_LEN += 1
        file.close()
        self.DATA_PATH_IDX.append(INDEX)
        self.DATA_PATH_LEN.append(DATA_LEN)
        INDEX += 1
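This constructor only records file paths and per-file line counts, so the actual reading and tokenizing presumably happens lazily per item. A sketch of how `__len__`/`__getitem__` might map a global line index back onto a (file, line) pair using the recorded lengths; these methods are an assumption, not part of the original:

def __len__(self):
    return sum(self.DATA_PATH_LEN)

def __getitem__(self, index):
    # walk the per-file line counts to find the file this index falls into
    for file_idx, length in zip(self.DATA_PATH_IDX, self.DATA_PATH_LEN):
        if index < length:
            with open(self.DATA_PATH[file_idx], 'r', encoding='utf-8') as f:
                line = f.readlines()[index]
            break
        index -= length
    tokens = self.bos_token_id + self.tokenizer.encode(line.strip()) + self.eos_token_id
    tokens += self.pad_token_id * (self.MAX_LEN - len(tokens))
    return torch.tensor(tokens)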
def __init__(self):
    # Load Reranker model & tokenizer
    print("Load Reranker model & tokenizer")
    self.reranker_model = BertForSequenceClassification.from_pretrained(
        "/models/reranker/checkpoint-920", num_labels=2)
    self.reranker_tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
    self.reranker_tokenizer.add_special_tokens({"additional_special_tokens": ["[/]"]})
    self.reranker_model.resize_token_embeddings(len(self.reranker_tokenizer))
    self.reranker_model = self.reranker_model.to("cuda")
    self.reranker_model.eval()

    # Load Classifier model & tokenizer
    print("Load Classifier model & tokenizer")
    self.classifier_model = BertForSequenceClassification.from_pretrained(
        "/models/classifier/checkpoint-190", num_labels=167)
    self.classifier_tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
    self.classifier_model = self.classifier_model.to("cuda")
    self.classifier_model.eval()

    # Load Generator model & tokenizer
    print("Load Generator model & tokenizer")
    self.generator_model = GPT2LMHeadModel.from_pretrained("/models/generator/checkpoint-851")
    self.generator_tokenizer = get_kogpt2_tokenizer()
    self.generator_tokenizer.add_special_tokens({"additional_special_tokens": ["<chatbot>"]})
    self.generator_model.resize_token_embeddings(len(self.generator_tokenizer))
    self.generator_model = self.generator_model.to("cuda")
    self.generator_model.eval()

    self.history = []
    self.candidates = []
    with open("/models/label_dic", 'rb') as f:
        self.temp_dic = pickle.load(f)
    self.labels = sorted(self.temp_dic.keys())
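A hedged sketch of how the generator stage might be invoked, given that `<chatbot>` was registered as a special token above. The prompt format, method name, and decoding parameters are all assumptions; `generate` and `encode(..., return_tensors="pt")` are standard Hugging Face calls:

def generate_reply(self, user_input):
    # assumed prompt format: user text, then the <chatbot> marker for the bot's turn
    prompt = user_input + "<chatbot>"
    input_ids = self.generator_tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = self.generator_model.generate(
            input_ids,
            max_length=128,           # assumed decoding parameters
            do_sample=True,
            top_k=50,
            pad_token_id=self.generator_tokenizer.pad_token_id)
    # decode only the newly generated tokens after the prompt
    return self.generator_tokenizer.decode(
        output[0][input_ids.shape[-1]:], skip_special_tokens=True)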
def __init__(self, root_path='../ai/chatbot'):
    checkpoint_path = f"{root_path}/checkpoint"
    self.model_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"

    checkpoint = torch.load(self.model_path, map_location=device)
    self.model = DialogKoGPT2()
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.model.eval()

    self.tokenizer = get_kogpt2_tokenizer()
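A minimal sketch of an inference method to pair with this constructor, mirroring the generate/decode pattern used in the `ChatBot` servicer above; the method name `answer` is an assumption:

def answer(self, question):
    tokenized_indexs = self.tokenizer.encode(question)
    input_ids = torch.tensor([self.tokenizer.bos_token_id]
                             + tokenized_indexs
                             + [self.tokenizer.eos_token_id]).unsqueeze(0)
    output = self.model.generate(input_ids=input_ids)
    # drop the prompt tokens (<s> + question) and decode only the reply
    return self.tokenizer.decode(output[0].tolist()[len(tokenized_indexs) + 1:],
                                 skip_special_tokens=True)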
def __init__(self, n_ctx=1024):
    self.file_path = "./TK_data/T1_wellness/T1_wellness_train.txt"
    self.DATA = []
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    pad_token_id = [self.tokenizer.pad_token_id]  # padding token

    file = open(self.file_path, 'r', encoding='utf-8')
    while True:
        line = file.readline()
        if not line:
            break
        datas = line.split(" ")

        q = datas[0]
        q_toked = self.tokenizer.encode(q)

        # sentiment = analyser.polarity_scores(text)
        sentiment = vader_polarity(q)
        if sentiment == 1:
            sentiment = 'g'  # good
        else:
            sentiment = 'b'  # bad
        sent_toked = self.tokenizer.encode(sentiment)

        a = datas[1]
        a_toked = self.tokenizer.encode(a[:-1])

        # Q token: <BOS> question <EOS> <BOS> sentiment <EOS>
        q_toked = bos_token_id + q_toked + eos_token_id + \
                  bos_token_id + sent_toked + eos_token_id
        q_len = len(q_toked)

        # A token: <BOS> answer <EOS>
        # a_toked = bos_token_id + sent_toked + eos_token_id + \
        a_toked = bos_token_id + a_toked + eos_token_id
        a_len = len(a_toked)

        # check padding length
        pad_token_len = n_ctx - q_len - a_len

        # padding
        index_of_words = q_toked + a_toked + pad_token_id * pad_token_len
        self.DATA.append(index_of_words)
    file.close()
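`vader_polarity` is not defined in this snippet. A plausible sketch consistent with the commented-out `analyser.polarity_scores` call, using the vaderSentiment package; the 0/1 threshold is an assumption. Note that VADER's lexicon is English, so its usefulness on Korean questions is limited:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def vader_polarity(text):
    # returns 1 for non-negative sentiment, 0 otherwise (threshold is an assumption)
    score = analyser.polarity_scores(text)['compound']
    return 1 if score >= 0 else 0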
def __init__(self, MAX_LEN=1024):
    self.file_path = "./TK_data/T0_data/T0_data.txt"
    self.DATA = []
    self.MAX_LEN = MAX_LEN
    self.signal = 1
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # begin-of-string token <BOS>
    eos_token_id = [self.tokenizer.eos_token_id]  # end-of-string token <EOS>
    pad_token_id = [self.tokenizer.pad_token_id]  # padding token
    # ==========================================================
    file = open(self.file_path, 'r', encoding='utf-8')
    TK_MAX_SIZE = 0
    while True:
        line = file.readline()
        if not line:
            break
        if line == "<CONTEXT_END>\n":
            self.signal = 1
            continue
        datas = line.split(" ")
        q_toked = self.tokenizer.encode(datas[0])
        a_toked = self.tokenizer.encode(datas[1][:-1])

        # Q token
        q_toked = bos_token_id + q_toked + eos_token_id
        q_len = len(q_toked)

        # A token
        a_toked = bos_token_id + a_toked + eos_token_id
        a_len = len(a_toked)

        # check padding length; skip samples that exceed the context window
        pad_token_len = MAX_LEN - q_len - a_len
        if pad_token_len < 0:
            continue
        if TK_MAX_SIZE < q_len + a_len:
            TK_MAX_SIZE = q_len + a_len

        # padding
        index_of_words = q_toked + a_toked + pad_token_id * pad_token_len
        self.DATA.append(index_of_words)
    file.close()
    print("\n\n\n MAXSIZE : {}".format(TK_MAX_SIZE))
def chatbot_qa(Question):
    # root_path = 'drive/My Drive/Colab Notebooks/dialogLM'
    root_path = str(pathlib.Path(__file__).parent.absolute())
    data_path = f"{root_path}\data\wellness_dialog_for_autoregressive_train.txt"
    checkpoint_path = f"{root_path}\checkpoint"
    # save_ckpt_path = f"{checkpoint_path}/kogpt2-wellness-auto-regressive.pth"
    # save_ckpt_path = f"D:\KNHANES_7\WEB_Ask_06devbros\ai\chatbot\checkpoint\kogpt2-wellness-auto-regressive.pth"

    ctx = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(ctx)

    # Load the saved checkpoint
    # checkpoint = torch.load(save_ckpt_path, map_location=device)
    checkpoint = torch.load("../checkpoint/kogpt2-wellness-auto-regressive.pth",
                            map_location=device)
    model = DialogKoGPT2()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    tokenizer = get_kogpt2_tokenizer()
    count = 0
    output_size = 200  # number of tokens to generate

    sent = Question
    tokenized_indexs = tokenizer.encode(sent)
    input_ids = torch.tensor([tokenizer.bos_token_id]
                             + tokenized_indexs
                             + [tokenizer.eos_token_id]).unsqueeze(0)
    # set top_k to 50
    sample_output = model.generate(input_ids=input_ids)

    chatbot_answer = tokenizer.decode(
        sample_output[0].tolist()[len(tokenized_indexs) + 1:],
        skip_special_tokens=True)
    print("Answer: " + chatbot_answer)
    print(100 * '-')
    return chatbot_answer
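The "set top_k to 50" comment and the `output_size` variable go unused in the call above. If `DialogKoGPT2.generate` forwards keyword arguments to Hugging Face's `generate` (an assumption about this wrapper), the intended sampling call would look like:

sample_output = model.generate(input_ids=input_ids,
                               max_length=output_size,
                               do_sample=True,
                               top_k=50)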
def __init__(self,
             device,
             n_ctx=1024,
             articles_max_length=810,
             summary_max_length=210):
    self.data = []
    self.tokenizer = get_kogpt2_tokenizer()
    bos_token_id = [self.tokenizer.bos_token_id]  # <s>
    eos_token_id = [self.tokenizer.eos_token_id]  # </s>
    pad_token_id = [self.tokenizer.pad_token_id]  # <pad>

    jsonl_datas = jsonl_load()
    # for dict_data in jsonl_datas:
    for dict_data in tqdm(jsonl_datas):
        articles = dict_data['article_original']
        abstractive_summary = dict_data['abstractive']

        tmp_str = ''
        for article in articles:
            tmp_str += article

        # encode; truncate if the string exceeds the max length
        enc_tmp_str = self.tokenizer.encode(tmp_str,
                                            truncation=True,
                                            max_length=articles_max_length)
        enc_abstractive_summary = self.tokenizer.encode(abstractive_summary,
                                                        truncation=True,
                                                        max_length=summary_max_length)

        # <s> text to summarize </s> summary </s>
        index_of_words = (bos_token_id + enc_tmp_str + eos_token_id
                          + enc_abstractive_summary + eos_token_id)
        pad_token_len = n_ctx - len(index_of_words)
        index_of_words += pad_token_id * pad_token_len
        self.data.append(torch.tensor(index_of_words).to(device))
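A minimal sketch of consuming this dataset for training, assuming the surrounding class implements `__len__`/`__getitem__` over `self.data`; the class name, batch size, and `model` are assumptions. In practice pad positions would be masked out of the loss, as the `ignore_index=3` criterion below does:

from torch.utils.data import DataLoader

dataset = KoGPT2SummarizationDataset(device)  # hypothetical class name
loader = DataLoader(dataset, batch_size=4, shuffle=True)
for batch in loader:                 # each batch: (batch_size, n_ctx) token ids
    outputs = model(batch, labels=batch)
    loss, logits = outputs[0], outputs[1]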
save_step = 100          # checkpoint save interval
learning_rate = 5e-5     # learning rate

# STEP 2-2. dataset & model
checkpoint = torch.load(save_ckpt_path, map_location=device)
model = DialogKoGPT2()
model.load_state_dict(checkpoint['model_state_dict'])
# model.to(device)
# model.eval()
model.train()

# STEP 2-3. training configuration
tokenizer = get_kogpt2_tokenizer()
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ========================= FOR CONVENIENCE =============================
bos_token_id = [tokenizer.bos_token_id]  # begin-of-string token <BOS>
eos_token_id = [tokenizer.eos_token_id]  # end-of-string token <EOS>
pad_token_id = [tokenizer.pad_token_id]  # padding token
# =======================================================================

# STEP 4. evaluation
while 1:  # for i in range(5):
    sent = input('Question: ')  # e.g. "I've been feeling depressed lately"
    tokenized_indexs = tokenizer.encode(sent)
    q_toked = bos_token_id + tokenized_indexs + eos_token_id
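The fragment cuts off after building `q_toked`. A hedged completion of the loop body, following the generate/decode pattern used in the other snippets here (a real evaluation pass would also switch the model back to `eval()` mode first):

    input_ids = torch.tensor(q_toked).unsqueeze(0)
    sample_output = model.generate(input_ids=input_ids)
    answer = tokenizer.decode(
        sample_output[0].tolist()[len(tokenized_indexs) + 1:],
        skip_special_tokens=True)
    print('Answer:', answer)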