Example #1
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
Example #2
File: data.py Project: tyhtm3/Photory-AI
def sentencePieceTokenizer():
    tok_path = get_tokenizer()
    sentencepieceTokenizer = SentencepieceTokenizer(tok_path,
                                                    num_best=0,
                                                    alpha=0)

    return sentencepieceTokenizer
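
All of these examples share one pattern: `get_tokenizer()` downloads the SentencePiece model file (if it is not already cached) and returns its local path, which is then wrapped in gluonnlp's `SentencepieceTokenizer`. A minimal sketch of the pattern, assuming the same `kogpt2` package used above (the printed pieces are illustrative):

from gluonnlp.data import SentencepieceTokenizer
from kogpt2.utils import get_tokenizer

tok_path = get_tokenizer()              # downloads the SentencePiece model if needed, returns its path
tok = SentencepieceTokenizer(tok_path)
print(tok("안녕하세요"))                 # -> subword pieces such as ['▁안녕', '하세요']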
Example #3
def chat(model_params, sent='0'):
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters(model_params, ctx=ctx)
    sent_tokens = tok(sent)
    while 1:
        q = input('user > ').strip()
        if q == 'quit':
            break
        q_tok = tok(q)
        a = ''
        a_tok = []
        while 1:
            input_ids = mx.nd.array([vocab[U_TKN]] + vocab[q_tok] +
                                    vocab[EOS, SENT] + vocab[sent_tokens] +
                                    vocab[EOS, S_TKN] +
                                    vocab[a_tok]).expand_dims(axis=0)
            pred = kogptqa(input_ids.as_in_context(ctx))
            gen = vocab.to_tokens(
                mx.nd.argmax(
                    pred,
                    axis=-1).squeeze().astype('int').asnumpy().tolist())[-1]
            if gen == EOS:
                break
            a += gen.replace('▁', ' ')
            a_tok = tok(a)
        print("Simsimi > {}".format(a.strip()))
Example #4
def chat(kogptqa, sent='0'):
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            while 1:
                input_ids = torch.LongTensor([
                    vocab[U_TKN]] + vocab[q_tok] +
                    vocab[EOS, SENT] + vocab[sent_tokens] +
                    vocab[EOS, S_TKN] +
                    vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                gen = vocab.to_tokens(
                    torch.argmax(
                        pred,
                        dim=-1).squeeze().numpy().tolist())[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
Example #5
def Tokenizer(item):
    item = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    # start from an empty (0, max_seqlen) LongTensor so torch.cat works on the first pass
    out = torch.empty(0, max_seqlen, dtype=torch.long)

    for i in item:

        toked = tok(i)
        input_ids = torch.tensor([
            vocab[vocab.bos_token],
        ] + vocab[toked]).unsqueeze(0)
        size = input_ids.shape
        # print(input_ids)
        # print(input_ids.shape)
        # pad each row to max_seqlen with zeros; torch.empty would leave uninitialized values
        y = torch.cat(
            [input_ids, torch.zeros(1, max_seqlen - size[1], dtype=torch.long)], axis=1)
        out = torch.cat([out, y], axis=0)

        print(out.shape)

    x_np = out.numpy()
    x_df = pd.DataFrame(x_np)
    x_df.to_csv('./data/encoded.csv', mode='w')
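
Example #5 depends on a module-level `max_seqlen` and on the NumPy/pandas imports shown; a hedged usage sketch (the constant value, file name, and column name are all assumptions):

import pandas as pd

max_seqlen = 64                      # assumed module-level constant
df = pd.read_csv('./data/raw.csv')   # hypothetical input file
Tokenizer(df['text'])                # hypothetical column name; writes ./data/encoded.csv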
Example #6
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model("cuda")
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
     self.max_gpu_load_train = 0
     self.max_memory_used_train = 0.0
Example #7
 def __init__(self, max_len=32, batch_size=64, lr=5e-5, num_epochs=1):
     super(KoGPT2Chat, self).__init__()
     self.batch_size = batch_size
     self.lr = lr
     self.max_len = max_len
     self.tok_path = get_tokenizer()
     self.num_epochs = num_epochs
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
     self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
Example #8
 def __init__(self, hparams, **kwargs):
     super(KoGPT2Chat, self).__init__()
     self.hparams = hparams  # args are passed in via hparams
     self.tok_path = get_tokenizer()
     self.neg = -1e18
     self.kogpt2, self.vocab = get_pytorch_kogpt2_model(
     )  # returns the model and the vocabulary as a pair
     self.loss_function = torch.nn.CrossEntropyLoss(
         reduction='none'
     )  # CrossEntropyLoss: a classification loss between the label (ground truth) and GPT-2's output
Example #9
    def __init__(self):

        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'
        self.tok = Mecab()
        _, self.vocab = get_pytorch_kogpt2_model()

        self.tok_path = get_tokenizer()
        self.tok2 = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
Example #10
 def __init__(self, vocab, MAX_LEN=32):
     self.DATA = pd.read_csv('./TK_data/Chatbot_data/ChatbotData.csv')
     self._tok_path = get_tokenizer()
     self.tokenizer = None
     self.first = True
     self.q_token = U_TKN  # BOS of Q
     self.a_token = S_TKN  # BOS of A
     self.sent_token = SENT
     self.bos = BOS
     self.eos = EOS
     self.maskt = MASK
     self.vocab = vocab
     self.MAX_LEN = MAX_LEN
     self.padder = nlp.data.PadSequence(
         MAX_LEN, pad_val=self.vocab[self.vocab.padding_token])
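
The `nlp.data.PadSequence` used here pads (or clips) each token-id list to a fixed length; a quick sketch of the behavior, assuming `nlp` is gluonnlp:

import gluonnlp as nlp

padder = nlp.data.PadSequence(5, pad_val=0)
print(padder([1, 2, 3]))           # -> [1, 2, 3, 0, 0]
print(padder([1, 2, 3, 4, 5, 6]))  # clipped to [1, 2, 3, 4, 5]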
Example #11
def Load_Model():
    global vocab_global
    global sent_tokens_global
    global kogptqa_global
    global tok_global
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    kogptqa = KoGPT2Chat(model)
    kogptqa.load_parameters("KoGPT2-chatbot/kogpt2_chat.params", ctx=ctx)
    sent_tokens = tok("0")
    vocab_global = vocab
    sent_tokens_global = sent_tokens
    kogptqa_global = kogptqa
    tok_global = tok
Example #12
def dataset(file_path):
    data = []
    tokenizer = SentencepieceTokenizer(get_tokenizer())
    f = open(file_path, 'r', encoding='utf-8')

    while True:
        file = f.readline()

        if not file:
            break
        line = tokenizer(file[:-1])
        indexing_word = [vocab[vocab.bos_token]
                         ] + vocab[line] + [vocab[vocab.eos_token]]
        data.append(indexing_word)

    f.close()

    return data
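
`dataset()` reads one sentence per line and wraps each in BOS/EOS ids; note that `vocab` is a module-level global here. In these projects it is typically obtained as below (a sketch using the same helpers as the other examples; the corpus file name is hypothetical):

from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model

_, vocab = get_pytorch_kogpt2_model()
data = dataset('corpus.txt')  # hypothetical corpus file, one sentence per line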
Example #13
    def __init__(self, load_path):
        ctx = "cuda"
        cachedir = "~/kogpt2/"
        org_path = "trained_models/gpt2_j20_1007.pt"

        # download vocab
        vocab_info = tokenizer
        vocab_path = download(
            vocab_info["url"],
            vocab_info["fname"],
            vocab_info["chksum"],
            cachedir=cachedir,
        )
        # set up the device
        device = torch.device(ctx)
        # load the saved checkpoint
        checkpoint = torch.load(load_path, map_location=device)
        # 1013: after training with special tokens the state_dict keys changed, so this remap is needed
        checkpoint_org = torch.load(org_path, map_location=device)
        ckpt_final = {
            k: v for k, v in zip(checkpoint_org.keys(), checkpoint.values())
        }  # swap the values of the original state_dict for the newly trained ones

        # declare GPT2LMHeadModel for KoGPT-2 language model training
        self.kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

        self.kogpt2model.load_state_dict(ckpt_final)
        self.kogpt2model.to(device)

        self.kogpt2model.eval()
        self.vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(
            vocab_path,
            mask_token=None,
            sep_token=None,
            cls_token=None,
            unknown_token="<unk>",
            padding_token="<pad>",
            bos_token="<s>",
            eos_token="</s>",
        )

        tok_path = get_tokenizer()
        self.tok = SentencepieceTokenizer(tok_path)
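
The `ckpt_final` comprehension above renames checkpoint keys by zipping the original state_dict's keys with the new checkpoint's values; it silently relies on both dicts listing parameters in the same order. A tiny self-contained illustration of the idea (dicts preserve insertion order in Python 3.7+):

old = {'a.weight': 1, 'b.weight': 2}            # original key names
new = {'a_new.weight': 10, 'b_new.weight': 20}  # retrained values under changed names
remapped = {k: v for k, v in zip(old.keys(), new.values())}
assert remapped == {'a.weight': 10, 'b.weight': 20}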
Example #14
    def __init__(self, vocab, MAX_LEN=2048):
        self.q_token = U_TKN  # BOS of Q
        self.a_token = S_TKN  # BOS of A
        self.bos = BOS
        self.eos = EOS
        self.maskt = MASK
        self.sent_token = SENT
        #-----------------------------------

        self.folder_path = "./TK_data/T0_data"
        self.DATA_PATH = []
        self.DATA_PATH_IDX = []
        self.DATA_PATH_LEN = []
        self.previous_context = None
        self.MAX_LEN = MAX_LEN

        #self.DATA = pd.read_csv('./TK_data/Chatbot_data/ChatbotData.csv')
        self._tok_path = get_tokenizer()
        self.tokenizer = None
        self.first = True

        self.vocab = vocab
        self.padder = nlp.data.PadSequence(
            MAX_LEN, pad_val=self.vocab[self.vocab.padding_token])

        TEMP_MAX = 0
        INDEX = 0
        for file_path in glob.glob(self.folder_path + "/*.txt"):
            self.DATA_PATH.append(file_path)
            file = open(file_path, 'r', encoding='utf-8')

            data = file.readline()
            DATA_LEN = 1
            while True:
                data = file.readline()
                if not data:  # EOF check restored; without it this loop never terminates
                    break
                DATA_LEN += 1
                self.DATA_PATH_IDX.append(INDEX)
                self.DATA_PATH_LEN.append(DATA_LEN)
            INDEX += 1
            file.close()
Example #15
def sentence_generation(random_tok, model_, vocab_):
    tok_path = get_tokenizer()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    n = 0
    sent = ''
    while n < 30:
        if n == 0:
            input_ids = torch.tensor([
                vocab_[vocab_.bos_token],
            ] + [random_tok]).unsqueeze(0)
        else:
            input_ids = torch.tensor([
                vocab_[vocab_.bos_token],
            ] + vocab_[toked]).unsqueeze(0)
        pred = model_(input_ids)[0]
        gen = vocab_.to_tokens(torch.argmax(pred,
                                            axis=-1).squeeze().tolist())[-1]
        if gen == '</s>':
            break
        sent += gen.replace('▁', ' ')
        toked = tok(sent)
        n += 1

    return sent
Example #16
device = torch.device(ctx)
kogpt2model.to(device)
# switch to train() for fine-tuning
kogpt2model.train()
# load the vocabulary
vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
                                                     mask_token=None,
                                                     sep_token=None,
                                                     cls_token=None,
                                                     unknown_token='<unk>',
                                                     padding_token='<pad>',
                                                     bos_token='<s>',
                                                     eos_token='</s>')
##########################################################################################
tok_path = get_tokenizer()
vocab = vocab_b_obj
sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

print("데이터 로드")

data_file_path = 'Data_crawler/dataset/삼성전자_pred/pre_삼성전자_연합인포맥스.json'

news_data = GPT_Dataset_Train(data_file_path)
news_dataset = GPTDataset(
    news_data, vocab,
    sentencepieceTokenizer)  # Dataset wrapper matching the Torch DataLoader interface
news_data_loader = DataLoader(news_dataset,
                              batch_size=4,
                              shuffle=True,
                              pin_memory=True)
Example #17
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--use_adapter",
                        default=False,
                        action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="",
                        help="add, attention, ")
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--bert_model_path",
                        default="./",
                        type=str,
                        help="Bert pre-trained model path")
    parser.add_argument(
        "--vocab_file",
        default="./vocab.korean.rawtext.list",
        type=str,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=50,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)
    bert_model.eval()

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_conkogpt2_model2(
        use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)
    gpt_model.eval()

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    model.eval()

    logger.info("Load test data")
    sourceList, targetList = get_test_dataset(bert_tokenizer, gpt_tokenizer,
                                              gpt_vocab, args.dataset_path)

    f1 = open((args.model_checkpoint + "_output.txt"), 'w')
    for line in zip(sourceList, targetList):
        out_ids = sample_sequence(line[0], bert_model, bert_tokenizer,
                                  gpt_model, gpt_vocab, args)
        out_texts = gpt_vocab.to_tokens(out_ids)
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
Example #18
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cuda',
         cachedir='~/kogpt2/',
         samples="./gdrive/My Drive/KoGPT2-FineTuning_pre/samples/"):

    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }

    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)

    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    device = torch.device(ctx)

    # load the saved checkpoint
    checkpoint = torch.load(load_path, map_location=device)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])

    kogpt2model.eval()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()

    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)
    num = 0

    if loops:
        num = 1
    else:
        num = 0

    try:
        load_path.split("/")[-2]
    except:
        pass
    else:
        load_path = load_path.split("/")[-2]

    print("weight load - ", load_path)

    while 1:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)

        if len(toked) > 1022:
            break

        # actual generation: keep only the top-k / top-p candidates from the vocabulary
        sent = sample_sequence(model, tok, vocab, sent, text_size, temperature,
                               top_p, top_k)

        sent = sent.replace("//", "\n")  # 비효율적이지만 엔터를 위해서 등장
        sent = sent.replace("</s>", "")
        sent = auto_enter(sent)
        print(sent)  # output

        now = [int(n) for n in os.listdir(samples + load_path)]

        try:
            now = max(now)
        except:
            now = 1

        # f = open(samples + load_path + "/" + str(now + 1), 'w', encoding="utf-8")

        # head = [load_path, tmp_sent, text_size, temperature, top_p, top_k]
        # head = [str(h) for h in head]
        # f.write(",".join(head))
        # f.write(",")
        # f.write(sent)
        # f.close()

        #tmp_sent = ""

        if num != 0:
            num += 1
            if num >= loops:
                print("good")
                return
Example #19
File: data.py Project: minji-o-j/MuTube
def sentencePieceTokenizer():
	tok_path = get_tokenizer()
	sentencepieceTokenizer = SentencepieceTokenizer(tok_path)
	return sentencepieceTokenizer
Example #20
def get_model_result(tmp_sent):

    ### 1. koGPT2 Config
    ctx = 'cpu'  # 'cuda' or 'cpu'; the training device, use the GPU on Colab
    cachedir = '~/nlp/'  # KoGPT-2 model download path
    epoch = 200  # training epochs
    save_path = './checkpoint'
    load_path = './checkpoint/narrativeKoGPT2_checkpoint.tar'
    # use_cuda = True  # flag for using the GPU inside Colab

    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }

    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    ### 2. Load the vocab
    # download vocab
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    ### 3. Checkpoint and device setup
    # set up the device
    device = torch.device(ctx)
    # load the saved checkpoint
    checkpoint = torch.load(load_path, map_location=device)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])

    kogpt2model.eval()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
                                                              mask_token=None,
                                                              sep_token=None,
                                                              cls_token=None,
                                                              unknown_token='<unk>',
                                                              padding_token='<pad>',
                                                              bos_token='<s>',
                                                              eos_token='</s>')
    ### 4. Tokenizer
    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    ### 5. Text Generation
    result = []
    usr_sent = tmp_sent
    sent = ''

    for j in range(10):
        if sent == '':
            sent = sent + usr_sent
        else:
            sent = generated_text

        # print(sent) ## print result
        result.append(sent)
        toked = tok(sent)
        count = 0
        generated_text = ''
        input_size = 50

        if len(toked) > 1022:
            break

        while(1):
            input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
            predicts = model(input_ids)
            pred = predicts[0]
            # print('predicts:', torch.argmax(pred, axis=-1).squeeze())
            # gen = vocab.to_tokens(torch.argmax(pred, axis=-1).squeeze().tolist())[-1]
            gen = topkSampling(pred, 10, vocab)

            if '</s>' in gen:
                gen = gen.replace('</s>', '')
            # if gen == '</s>':
            # print('to_tokens:',vocab.to_tokens(torch.argmax(pred, axis=-1).squeeze().tolist()))
            if '.' in gen or count > input_size:
                sent += gen.replace('▁', ' ').replace('</', '')
                generated_text += gen.replace('▁', ' ').replace('</', '')
                # sent += '\n'
                # generated_text += '\n'
                toked = tok(sent)
                count = 0
                break
                # print('to_tokens:',vocab.to_tokens(torch.argmax(pred, axis=-1).squeeze().tolist()))
            # if count >= input_size:
            #   break
            sent += gen.replace('▁', ' ').replace('<', '')
            generated_text += gen.replace('▁', ' ').replace('<', '')

            toked = tok(sent)
            count += 1
        # print('result:')
        # print(sent)

        split = sent.split('\n')
        # print(split)
        if len(split) > 1:
            # print(split[1])
            if sent == split[1]:
                break
    result = ''.join(result)
    return result
Example #21
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=0,
         load_path=""):
    ctx = 'cuda'
    cachedir = '~/kogpt2/'
    save_path = './checkpoint/'
    # download model
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)
    # download vocab
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)
    # set up the device
    device = torch.device(ctx)
    # load the saved checkpoint
    checkpoint = torch.load(load_path, map_location=device)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])

    kogpt2model.eval()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    if loops:
        num = 1
    else:
        num = 0

    while 1:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)

        if len(toked) > 1022:
            break

        sent = sample_sequence(model, tok, vocab, sent, text_size, temperature,
                               top_p, top_k)
        sent = sent.replace("<unused0>", "\n")  # 비효율적이지만 엔터를 위해서 등장
        sent = auto_enter(sent)
        print(sent)

        now = [int(n) for n in os.listdir("./samples")]
        if len(now) == 0:
            now = 0
        else:
            now = max(now)
        #now = max(now)
        f = open("samples/" + str(now + 1), 'w', encoding="utf-8")
        head = [load_path, tmp_sent, text_size, temperature, top_p, top_k]
        head = [str(h) for h in head]
        f.write(",".join(head))
        f.write("\n")
        f.write(sent)
        f.close()

        tmp_sent = ""

        if num != 0:
            num += 1
            if num >= loops:
                print("good")
                return
Example #22
import os
import torch
import platform
import sentencepiece
from kogpt2.utils import get_tokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from flask import Flask, request, jsonify, __version__ as flaskver

tok_path = get_tokenizer(cachedir='./bin/')
model, vocab = get_pytorch_kogpt2_model(cachedir='./bin/')
tok = sentencepiece.SentencePieceProcessor(tok_path)

app = Flask(__name__)
port = int(os.getenv('port', '8080'))


@app.route('/', methods=['GET'])
def root():
    env = {
        'python': platform.python_version(),
        'flask': flaskver,
        'pytorch': torch.__version__
    }
    urls = {
        'original': 'https://github.com/SKT-AI/KoGPT2',
        'fork': 'https://github.com/pmh-only/KoGPT2'
    }
    usage = 'GET /job?query=<sentence>[&loop=<loopLimit>]'
    return jsonify(label='kogpt2', urls=urls, env=env, usage=usage)

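The usage string advertises a `GET /job` endpoint that is not part of this excerpt. A minimal sketch of what such a handler could look like, reusing the model, vocab, and tokenizer loaded above and the greedy decoding loop from the other examples (this is an assumption, not the fork's actual code; `SentencePieceProcessor.encode(..., out_type=str)` requires a reasonably recent sentencepiece release):

@app.route('/job', methods=['GET'])
def job():
    query = request.args.get('query', '')
    loop_limit = int(request.args.get('loop', '32'))  # assumed default loop limit
    sent = query
    toked = tok.encode(sent, out_type=str)            # SentencePiece pieces, e.g. '▁...'
    with torch.no_grad():
        for _ in range(loop_limit):
            input_ids = torch.tensor([vocab[vocab.bos_token]] + vocab[toked]).unsqueeze(0)
            pred = model(input_ids)[0]
            gen = vocab.to_tokens(torch.argmax(pred, dim=-1).squeeze().tolist())[-1]
            if gen == vocab.eos_token:
                break
            sent += gen.replace('▁', ' ')
            toked = tok.encode(sent, out_type=str)
    return jsonify(query=query, answer=sent)
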
Example #23
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cpu',
         cachedir='~/kogpt2/',
         samples="./samples"):

    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }

    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)

    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    device = torch.device(ctx)

    # load the saved checkpoint
    checkpoint = torch.load(load_path, map_location=device)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])

    kogpt2model.eval()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()

    model, vocab = kogpt2model, vocab_b_obj
    vocab.token_to_idx["\n"] = vocab.token_to_idx["<unused0>"]
    del vocab.token_to_idx["<unused0>"]

    tok = SentencepieceTokenizer(tok_path)
    num = 0

    sent_dict = {}

    if loops != -1:
        num = 1

    while 1:
        sent = ''
        if tmp_sent == "":
            tmp_sent = input('input : ')
        sent = sent + tmp_sent

        toked = tok(sent)

        if len(toked) > 1022:
            break

        sent = sample_sequence(model, tok, vocab, sent, text_size, temperature,
                               top_p, top_k)
        sent = sent.replace("<unused0>", "\n")  # 비효율적이지만 엔터를 위해서 등장
        sent = auto_enter(sent)
        # print(sent)

        sent_dict[num] = sent
        now = [int(n) for n in os.listdir(samples)]
        now = max(now) if now else 0  # guard against an empty samples directory
        f = open(samples + str(now + 1), 'w', encoding="utf-8")
        f.write(sent)
        f.close()

        if num:
            num += 1
            if num >= loops:
                print("good")
                return sent_dict
Example #24
def main(args):
    # toker = GPT2Tokenizer.from_pretrained('gpt2')
    tok_path = get_tokenizer()
    toker = SentencepieceTokenizer(tok_path)
    _, vocab = get_pytorch_kogpt2_model()
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
                   f'{".".join(attrs)}.db/db')
    else:
        db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with shelve.open(db_path, 'n') as db:
        # reader = open(args.corpus, "r", encoding="utf-8")
        reader = pd.read_csv(args.corpus, sep='\t', header=None)
        chunk = []
        n_chunk = 0
        n_example = 0

        # print("pdb-attach")
        # from pdb_clone import pdb
        # rsock = pdb.set_trace_remote()
        #
        # if rsock.state != rsock.ST_CONNECTED:
        #   input()

        for _, line in tqdm(reader.iterrows(), total=len(reader.index)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker, vocab)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs, toker,
                                          vocab, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {
        'n_example': n_example,
        'chunk_size': args.chunk_size,
        'max_seq_len': args.max_seq_len,
        'reverse': args.reverse,
        'two_turn': args.two_turn
    }
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
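
`main()` stores each chunk gzip-compressed and JSON-encoded inside a shelve DB; reading one back simply reverses the two steps (a sketch, with a hypothetical DB path):

import gzip, json, shelve

with shelve.open('corpus.128len.db/db', 'r') as db:   # hypothetical path
    chunk_0 = json.loads(gzip.decompress(db['chunk_0']).decode('utf-8'))
    print(len(chunk_0), 'features in the first chunk')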
Example #25
def main(temperature=0.7,
         top_p=0.8,
         top_k=40,
         tmp_sent="",
         text_size=100,
         loops=-1,
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         ctx='cuda',
         cachedir='~/kogpt2/',
         samples="./gdrive/My Drive/KoGPT2-FineTuning_pre/samples/"):
    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }

    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)

    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    device = torch.device(ctx)

    # load the saved checkpoint
    checkpoint = torch.load(load_path, map_location=device)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(checkpoint['model_state_dict'])

    kogpt2model.eval()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()

    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    try:
        load_path.split("/")[-2]
    except:
        print("path error")
    else:
        load_path = load_path.split("/")[-2]

    print("ok : ", load_path)
    while (True):
        sent = input()
        make_sentence(model, tok, vocab, sent, text_size, temperature, top_p,
                      top_k, loops)
Example #26
def train():
    tok_path = get_tokenizer()
    model, vocab = get_mxnet_kogpt2_model(ctx=ctx)
    # tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

    data = pd.read_csv('Chatbot_data/ChatbotData.csv')

    max_len = opt.max_seq_len
    train_set = chat_data(data, tok_path, vocab, max_len=max_len)
    batch_size = opt.batch_size

    train_dataloader = mx.gluon.data.DataLoader(train_set,
                                                batch_size=batch_size,
                                                num_workers=5,
                                                shuffle=True)
    kogptqa = KoGPT2Chat(model)
    kogptqa.hybridize()

    # softmax cross entropy loss for classification
    loss_function = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_function.hybridize()

    num_epochs = opt.num_epoch
    lr = 5e-5
    trainer = gluon.Trainer(kogptqa.collect_params(), 'bertadam', {
        'learning_rate': lr,
        'epsilon': 1e-8,
        'wd': 0.01
    })
    # do not apply weight decay to LayerNorm and bias parameters
    for _, v in kogptqa.collect_params('.*beta|.*gamma|.*bias').items():
        v.wd_mult = 0.0
    params = [
        p for p in kogptqa.collect_params().values() if p.grad_req != 'null'
    ]
    # learning rate warmup
    accumulate = opt.accumulate
    step_size = batch_size * accumulate if accumulate else batch_size
    num_train_examples = len(train_set)
    num_train_steps = int(num_train_examples / step_size * num_epochs)
    warmup_ratio = 0.1
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    step_num = 0
    all_model_params = kogptqa.collect_params()

    log_interval = 50
    neg = -1e18
    # Set grad_req if gradient accumulation is required
    if accumulate and accumulate > 1:
        for p in params:
            p.grad_req = 'add'

    for epoch_id in range(num_epochs):
        step_loss = 0
        for batch_id, (token_ids, mask, label) in enumerate(train_dataloader):
            if step_num < num_warmup_steps:
                new_lr = lr * step_num / num_warmup_steps
            else:
                non_warmup_steps = step_num - num_warmup_steps
                offset = non_warmup_steps / (num_train_steps -
                                             num_warmup_steps)
                new_lr = lr - offset * lr
            trainer.set_learning_rate(new_lr)
            with mx.autograd.record():
                # load data to the GPU or CPU
                token_ids = token_ids.as_in_context(ctx)
                mask = mask.as_in_context(ctx)
                label = label.as_in_context(ctx)
                # forward computation
                out = kogptqa(token_ids)
                masked_out = nd.where(
                    mask.expand_dims(axis=2).repeat(repeats=out.shape[2],
                                                    axis=2), out,
                    neg * nd.ones_like(out))
                # loss over the response tokens, excluding MASK and PAD
                ls = loss_function(masked_out, label).sum() / mask.sum()
            # backward computation
            ls.backward()
            if not accumulate or (batch_id + 1) % accumulate == 0:
                trainer.allreduce_grads()
                nlp.utils.clip_grad_global_norm(params, 1)
                trainer.update(accumulate if accumulate else 1)
                step_num += 1
                if accumulate and accumulate > 1:
                    # set grad to zero for gradient accumulation
                    all_model_params.zero_grad()
            step_loss += ls.asscalar()
            if step_num % log_interval == 0 and step_num > 0:
                print(
                    '[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.10f}, train ppl={:.3f}'
                    .format(epoch_id + 1, batch_id + 1, len(train_dataloader),
                            step_loss / log_interval, trainer.learning_rate,
                            math.exp(step_loss / log_interval)))
                step_loss = 0
    logging.info('saving model file to {}'.format(opt.model_params))
    kogptqa.save_parameters(opt.model_params)
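
The learning-rate logic inside the batch loop is a linear warmup over the first 10% of steps followed by a linear decay to zero; factored out, it is (a sketch of the same arithmetic):

def linear_warmup_decay(step_num, lr, num_warmup_steps, num_train_steps):
    if step_num < num_warmup_steps:
        return lr * step_num / num_warmup_steps  # ramp up from 0 to lr
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return lr - offset * lr                      # decay linearly back to 0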
Example #27
def main(epoch, save_path, load_path, samples, data_file_path, batch_size):
    ctx = 'cuda'
    cachedir = '~/kogpt2/'

    summary = SummaryWriter()

    # download model
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)
    # download vocab
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    # declare GPT2LMHeadModel for KoGPT-2 language model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

    # load the weights downloaded to model_path via load_state_dict
    kogpt2model.load_state_dict(torch.load(model_path))

    device = torch.device(ctx)
    kogpt2model.to(device)

    # checkpoint loading section
    try:
        checkpoint = torch.load(load_path, map_location=device)

        # declare GPT2LMHeadModel for KoGPT-2 language model training
        kogpt2model = GPT2LMHeadModel(
            config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])

        kogpt2model.eval()
    except:
        count = 0
    else:
        count = int(re.findall("\d+", load_path)[1])

    print(count)
    # call .train() to continue training
    kogpt2model.train()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    dataset = Read_Dataset(data_file_path, vocab, tok)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             pin_memory=True)

    learning_rate = 3e-5
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('KoGPT-2 Transfer Learning Start')
    avg_loss = (0.0, 0.0)

    for epoch in range(epoch):
        for data in data_loader:
            optimizer.zero_grad()
            data = torch.stack(
                data)  # the batch is a list of Tensors, so stack it into one Tensor
            data = data.transpose(1, 0)
            data = data.to(ctx)
            model = model.to(ctx)

            outputs = model(data, labels=data)
            loss, logits = outputs[:2]
            loss = loss.to(ctx)
            loss.backward()
            avg_loss = (avg_loss[0] * 0.99 + loss, avg_loss[1] * 0.99 + 1.0)
            optimizer.step()
            if count % 10 == 0:
                print(
                    'epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'
                    .format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
                summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1],
                                   count)
                summary.add_scalar('loss/loss', loss, count)

            # run the generator
            if (count > 0 and count % 1000 == 0) or (len(data) < batch_size):
                sent = sample_sequence(model.to("cpu"),
                                       tok,
                                       vocab,
                                       sent="사랑",
                                       text_size=100,
                                       temperature=0.7,
                                       top_p=0.8,
                                       top_k=40)
                sent = sent.replace("<unused0>", "\n")
                print(sent)

                summary.add_text('Text', sent, count)

                if count > 500000:
                    now = [int(n) for n in os.listdir(samples)]
                    now = max(now)
                    f = open(samples + str(now + 1), 'w', encoding="utf-8")
                    f.write(sent)
                    f.close()
            #########################################
            count += 1

            if (count > 0 and count % 10000 == 0) or (len(data) < batch_size):
                # save the model
                try:
                    torch.save(
                        {
                            'epoch': epoch,
                            'train_no': count,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss
                        },
                        save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')
                except:
                    pass
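
The `avg_loss` pair in the training loop is a bias-corrected exponential moving average: numerator and denominator both decay by 0.99 each step, so `avg_loss[0] / avg_loss[1]` tracks recent losses without the early-step bias of a plain EMA. A tiny standalone demonstration:

ema_num, ema_den = 0.0, 0.0
for loss in [2.0, 1.5, 1.2, 1.1]:
    ema_num = ema_num * 0.99 + loss
    ema_den = ema_den * 0.99 + 1.0
    print(round(ema_num / ema_den, 4))  # smoothed loss, well-defined from step one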
Example #28
def main(epoch = 200, save_path = './checkpoint/', load_path = './checkpoint/KoGPT2_checkpoint_long.tar',
		data_file_path = 'dataset/lyrics_dataset.txt', batch_size = 8, summary_url = 'runs/', new = 0, text_size = 100):
	ctx = 'cuda'
	cachedir = '~/kogpt2/'
	summary = SummaryWriter(summary_url)

	pytorch_kogpt2 = {
		'url': 'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
		'fname': 'pytorch_kogpt2_676e9bcfa7.params',
		'chksum': '676e9bcfa7'
	}
	kogpt2_config = {
		"initializer_range": 0.02,
		"layer_norm_epsilon": 1e-05,
		"n_ctx": 1024,
		"n_embd": 768,
		"n_head": 12,
		"n_layer": 12,
		"n_positions": 1024,
		"vocab_size": 50000
	}

	# download model
	model_info = pytorch_kogpt2
	model_path = download(model_info['url'],
						   model_info['fname'],
						   model_info['chksum'],
						   cachedir=cachedir)
	# download vocab
	vocab_info = tokenizer
	vocab_path = download(vocab_info['url'],
						   vocab_info['fname'],
						   vocab_info['chksum'],
						   cachedir=cachedir)

	# declare GPT2LMHeadModel for KoGPT-2 language model training
	kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

	# load the weights downloaded to model_path via load_state_dict
	kogpt2model.load_state_dict(torch.load(model_path))

	device = torch.device(ctx)
	kogpt2model.to(device)
	count = 0
	# checkpoint loading section
	try:
		checkpoint = torch.load(load_path, map_location=device)

		# declare GPT2LMHeadModel for KoGPT-2 language model training
		kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
		kogpt2model.load_state_dict(checkpoint['model_state_dict'])

		kogpt2model.eval()
	except:
		print("count 0 : ", load_path)
	else:
		print("count check : ",re.findall("\d+", load_path))
		count = max([int(i) for i in (re.findall("\d+", load_path))])

	if new:
		count = 0
	# call .train() to continue training
	kogpt2model.train()
	vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
								mask_token=None,
								sep_token=None,
								cls_token=None,
								unknown_token='<unk>',
								padding_token='<pad>',
								bos_token='<s>',
								eos_token='</s>')

	tok_path = get_tokenizer()
	model, vocab = kogpt2model, vocab_b_obj
	sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

	dataset = Read_Dataset(data_file_path, vocab, sentencepieceTokenizer)
	data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

	learning_rate = 3e-5
	criterion = torch.nn.CrossEntropyLoss()
	optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

	## train
	# vocab.token_to_idx["\n"] = vocab.token_to_idx["<unused0>"]
	# del vocab.token_to_idx["<unused0>"]
	# vocab.token_to_idx["<|endoftext|>"] = vocab.token_to_idx["<unused1>"]
	# del vocab.token_to_idx["<unused1>"]

	model = model.to(ctx)
	tok = SentencepieceTokenizer(tok_path)

	print('KoGPT-2 Transfer Learning Start')
	avg_loss = (0.0, 0.0)
	for epoch in range(epoch):
		for data in data_loader:
			optimizer.zero_grad()
			data = torch.stack(data)  # the batch is a list of Tensors, so stack it into one Tensor
			data = data.transpose(1,0)
			data = data.to(ctx)
			model = model.to(ctx)

			outputs = model(data, labels=data)
			loss, logits = outputs[:2]
			loss = loss.to(ctx)
			loss.backward()
			avg_loss = (avg_loss[0] * 0.99 + loss, avg_loss[1] * 0.99 + 1.0)
			optimizer.step()

			if count % 10 == 0:
				print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'.format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
				summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1], count)
				summary.add_scalar('loss/loss', loss, count)
				# print("save")
				# torch.save({
				# 	'epoch': epoch,
				# 	'train_no': count,
				# 	'model_state_dict': model.state_dict(),
				# 	'optimizer_state_dict': optimizer.state_dict(),
				# 	'loss': loss
				# }, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')

				# run the generator
				if (count > 0 and count % 1000 == 0) or (len(data) < batch_size):
					sent = sample_sequence(model.to("cpu"), tok, vocab, sent="성실", text_size=text_size, temperature=0.7, top_p=0.8, top_k=40)
					sent = sent.replace("<unused0>", "\n") # 비효율적이지만 엔터를 위해서 등장
					sent = auto_enter(sent)
					print(sent)
					summary.add_text('Text', sent, count)
					del sent
					pass

			#########################################
			if (count > 0 and count % 18500 == 0):
				# save the model
				try:
					torch.save({
						'epoch': epoch,
						'train_no': count,
						'model_state_dict': model.state_dict(),
						'optimizer_state_dict': optimizer.state_dict(),
						'loss': loss
					}, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')
				except:
					pass
			count += 1
Example #29
 def __init__(self):
     self.tok_path = get_tokenizer()
     self.tok = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
Example #30
    def __init__(self, vocab, MAX_LEN=1024):
        self.q_token = U_TKN  # BOS of Q
        self.a_token = S_TKN  # BOS of A
        self.bos = BOS
        self.eos = EOS
        self.maskt = MASK
        self.sent_token = SENT
        #-----------------------------------

        self.folder_path = "./TK_data/TT_data"
        self.CONTEXT_IN = []
        self.MASK_IN = []
        self.LABELS_IN = []
        self.MAX_LEN = MAX_LEN

        #self.DATA = pd.read_csv('./TK_data/Chatbot_data/ChatbotData.csv')
        self._tok_path = get_tokenizer()
        self.tokenizer = None
        if self.tokenizer is None:
            self._activate_sp()
        self.first = True

        self.vocab = vocab
        self.padder = nlp.data.PadSequence(
            MAX_LEN, pad_val=self.vocab[self.vocab.padding_token])

        #==========================================================
        for file_path in glob.glob(self.folder_path + "/*.txt"):
            file = open(file_path, 'r', encoding='utf-8')

            even_or_odd = 0
            CONTEXT_IN = []
            MASK_IN = []
            LABELS_IN = []
            while True:
                data = file.readline()
                print("\n\n\nTK: {}\n\n\n".format(data))
                if not data:
                    break
                q_toked = self.tokenizer(data[:-1])
                #print("S : {}\n".format(data[:-1]))
                if even_or_odd % 2 == 0:
                    CONTEXT_IN_TEMP = [self.q_token] + q_toked + [self.eos]
                    CONTEXT_IN += CONTEXT_IN_TEMP
                    MASK_IN += [0] * len(CONTEXT_IN_TEMP)
                    LABELS_IN += [self.maskt] * len(CONTEXT_IN_TEMP)
                else:
                    CONTEXT_IN_TEMP = [self.a_token] + q_toked + [self.eos]
                    CONTEXT_IN += CONTEXT_IN_TEMP
                    MASK_IN += [1] * len(CONTEXT_IN_TEMP)
                    LABELS_IN += CONTEXT_IN_TEMP
                even_or_odd += 1
            #print("I : {}\n".format(CONTEXT_IN))
            #print("I2 : {}\n".format(MASK_IN))
            CONTEXT_LEN = len(CONTEXT_IN)
            if CONTEXT_LEN > self.MAX_LEN:
                raise Exception('Unexpected CONTEXT_LEN: {}'.format(CONTEXT_LEN))
            pad_token_len = MAX_LEN - CONTEXT_LEN
            MASK_IN += [0] * pad_token_len
    
            self.CONTEXT_IN.append(CONTEXT_IN)
            # [0, 0, 0, 0, ....., 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, .... ]
            self.MASK_IN.append(MASK_IN)
            # [mask, mask, ...., mask, ..., <bos>,..A.. <eos>, <pad>....]
            self.LABELS_IN.append(LABELS_IN)
            file.close()