Example #1
def main(epoch, save_path, load_path, samples, data_file_path, batch_size):
    ctx = 'cuda'
    cachedir = '~/kogpt2/'

    summary = SummaryWriter()

    # download model
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)
    # download vocab
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    # Declare a GPT2LMHeadModel for KoGPT-2 language-model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

    # Load the weights downloaded to model_path via load_state_dict
    kogpt2model.load_state_dict(torch.load(model_path))

    device = torch.device(ctx)
    kogpt2model.to(device)

    # Load from an existing checkpoint, if any
    try:
        checkpoint = torch.load(load_path, map_location=device)

        # Re-declare the GPT2LMHeadModel and restore the fine-tuned weights
        kogpt2model = GPT2LMHeadModel(
            config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])

        kogpt2model.eval()
    except Exception:
        count = 0
    else:
        # the second number in the file name is the step count
        count = int(re.findall(r"\d+", load_path)[1])

    print(count)
    # switch back to .train() to continue fine-tuning
    kogpt2model.train()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    tok = SentencepieceTokenizer(tok_path)

    dataset = Read_Dataset(data_file_path, vocab, tok)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             pin_memory=True)

    learning_rate = 3e-5
    criterion = torch.nn.CrossEntropyLoss()  # unused: the model computes the LM loss itself
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print('KoGPT-2 Transfer Learning Start')
    avg_loss = (0.0, 0.0)  # (decayed loss sum, decayed count) for a moving average

    for epoch in range(epoch):
        for data in data_loader:
            optimizer.zero_grad()
            data = torch.stack(
                data)  # the batch arrives as a list of Tensors, so stack it into one
            data = data.transpose(1, 0)
            data = data.to(ctx)
            model = model.to(ctx)

            outputs = model(data, labels=data)
            loss, logits = outputs[:2]
            loss = loss.to(ctx)
            loss.backward()
            avg_loss = (avg_loss[0] * 0.99 + loss.item(),
                        avg_loss[1] * 0.99 + 1.0)
            optimizer.step()
            if count % 10 == 0:
                print(
                    'epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'
                    .format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
                summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1],
                                   count)
                summary.add_scalar('loss/loss', loss, count)

            # periodically generate a sample
            if (count > 0 and count % 1000 == 0) or (len(data) < batch_size):
                sent = sample_sequence(model.to("cpu"),
                                       tok,
                                       vocab,
                                       sent="사랑",
                                       text_size=100,
                                       temperature=0.7,
                                       top_p=0.8,
                                       top_k=40)
                sent = sent.replace("<unused0>", "\n")
                print(sent)

                summary.add_text('Text', sent, count)

                if count > 500000:
                    now = max(int(n) for n in os.listdir(samples))
                    with open(samples + str(now + 1), 'w', encoding="utf-8") as f:
                        f.write(sent)
            #########################################
            count += 1

            if (count > 0 and count % 10000 == 0) or (len(data) < batch_size):
                # save a checkpoint
                try:
                    torch.save(
                        {
                            'epoch': epoch,
                            'train_no': count,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss
                        },
                        save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')
                except Exception:
                    pass  # ignore save failures and keep training
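
The four examples share several helpers that are defined elsewhere in the project (download, Read_Dataset, sample_sequence, auto_enter). As a reading aid, here is a minimal sketch of what Read_Dataset plausibly looks like, inferred from how the training loop consumes it: __getitem__ returns a fixed-length list of token ids, so the DataLoader's default collate yields a list of Tensors, which the loop stacks and transposes into a [batch, seq_len] Tensor. The line-per-sample format, the 1024-token padding length, and the special-token handling are assumptions, not the original implementation.

import torch
from torch.utils.data import Dataset

class Read_Dataset(Dataset):
    """Hypothetical reconstruction: one training text per line."""

    def __init__(self, file_path, vocab, tokenizer, max_len=1024):
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(file_path, 'r', encoding='utf-8') as f:
            self.lines = [line.strip() for line in f if line.strip()]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        # wrap the text in the <s>/</s> markers configured on the vocab
        tokens = ['<s>'] + self.tokenizer(self.lines[idx]) + ['</s>']
        ids = [self.vocab[t] for t in tokens][:self.max_len]
        # pad to a fixed length; returning a plain list of ints is what makes
        # the default collate produce the list of Tensors the loop stacks
        ids += [self.vocab['<pad>']] * (self.max_len - len(ids))
        return ids

Examples #2 and #4 additionally index datas[2] for a per-sample score, so their dataset presumably returns a tuple with extra fields rather than the bare id list sketched here.
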
Example #2
def main(epoch=200,
         save_path='./checkpoint/',
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         data_file_path='dataset/lyrics_dataset.txt',
         batch_size=8,
         summary_url='runs/',
         new=0,
         text_size=100):
    ctx = 'cuda'
    cachedir = '~/kogpt2/'
    summary = SummaryWriter(summary_url)

    pytorch_kogpt2 = {
        'url':
        'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
        'fname': 'pytorch_kogpt2_676e9bcfa7.params',
        'chksum': '676e9bcfa7'
    }
    kogpt2_config = {
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "vocab_size": 50000
    }

    # download model
    model_info = pytorch_kogpt2
    model_path = download(model_info['url'],
                          model_info['fname'],
                          model_info['chksum'],
                          cachedir=cachedir)
    # download vocab
    vocab_info = tokenizer
    vocab_path = download(vocab_info['url'],
                          vocab_info['fname'],
                          vocab_info['chksum'],
                          cachedir=cachedir)

    # Declare a GPT2LMHeadModel for KoGPT-2 language-model training
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

    # Load the weights downloaded to model_path via load_state_dict,
    # i.e. start from the pretrained base model's parameters
    kogpt2model.load_state_dict(torch.load(model_path))

    device = torch.device(ctx)  #GPU
    kogpt2model.to(device)
    count = 0

    # Load from an existing checkpoint, if any
    try:
        checkpoint = torch.load(load_path, map_location=device)

        # Re-declare the GPT2LMHeadModel and restore the fine-tuned weights
        kogpt2model = GPT2LMHeadModel(
            config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])

        kogpt2model.eval()
    except Exception:
        print("count 0 : ", load_path)
    else:
        print("count check : ", re.findall(r"\d+", load_path))
        # take the largest number in the file name as the step count
        count = max(int(i) for i in re.findall(r"\d+", load_path))

    if new:
        count = 0

    # switch back to .train() to continue fine-tuning
    kogpt2model.train()
    vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(
        vocab_path,
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>')

    tok_path = get_tokenizer()
    model, vocab = kogpt2model, vocab_b_obj
    sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

    # load our dataset
    dataset = Read_Dataset(data_file_path, vocab, sentencepieceTokenizer)
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=True,
                             pin_memory=True)

    # training hyperparameters
    learning_rate = 3e-5
    criterion = torch.nn.CrossEntropyLoss()  # unused: the model computes the LM loss itself
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model = model.to(ctx)
    # a tokenizer instance makes BPE splitting and joining convenient
    tok = SentencepieceTokenizer(tok_path)

    print('KoGPT-2 Transfer Learning Start')

    # create a per-genre checkpoint folder if it does not exist yet
    try:
        if not (os.path.isdir(save_path + data_file_path.split("/")[-1][:-4])):
            os.makedirs(
                os.path.join(save_path + data_file_path.split("/")[-1][:-4]))
    except OSError as e:
        if e.errno != errno.EEXIST:
            print("Failed to create directory!!!!!")
            raise

    avg_loss = (0.0, 0.0)  # (decayed loss sum, decayed count) for a moving average
    for epoch in range(epoch):
        # fetch batches and train
        for datas in data_loader:
            data = datas[0]

            optimizer.zero_grad()
            data = torch.stack(
                data)  # the batch arrives as a list of Tensors, so stack it into one
            data = data.transpose(1, 0)
            data = data.to(ctx)
            model = model.to(ctx)

            # forward pass
            outputs = model(data, labels=data)
            loss, logits = outputs[:2]

            nowloss = copy.copy(loss)  # keep the unweighted loss (currently unused)
            # running average: avg_loss[0] / avg_loss[1] is the normalized loss
            avg_loss = (avg_loss[0] * 0.99 + loss.item(),
                        avg_loss[1] * 0.99 + 1.0)

            loss *= datas[2][0]  # weight the loss by the per-sample score

            loss = loss.to(ctx)
            loss.backward()

            # apply the update
            optimizer.step()

            if count % 10 == 0:
                print(
                    'epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'
                    .format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
                summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1],
                                   count)
                summary.add_scalar('loss/loss', loss, count)
                # print("save")
                # torch.save({
                # 	'epoch': epoch,
                # 	'train_no': count,
                # 	'model_state_dict': model.state_dict(),
                # 	'optimizer_state_dict': optimizer.state_dict(),
                # 	'loss': loss
                # }, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')

                # periodically generate a sample
                if (count > 0 and count % 2500 == 0):
                    sent = sample_sequence(model.to("cpu"),
                                           tok,
                                           vocab,
                                           sent="가",
                                           text_size=text_size,
                                           temperature=0.7,
                                           top_p=0.9,
                                           top_k=100)
                    sent = sent.replace("//", "\n")  # 비효율적이지만 엔터를 위해서 등장
                    sent = auto_enter(sent)
                    print(sent)
                    summary.add_text('Text', sent, count)
                    del sent

            #########################################
            if (count > 0 and count % 10000 == 0):
                print("모델을 저장합니다.")
                # 모델 저장
                try:
                    torch.save(
                        {
                            'epoch': epoch,
                            'train_no': count,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'loss': loss
                        }, save_path + data_file_path.split("/")[-1][:-4] +
                        '/' + 'KoGPT2_checkpoint_' + str(count) + '.tar')

                    #print("문제 시작")

                    # 드롭박스에 저장
                    large_file = open(
                        save_path + data_file_path.split("/")[-1][:-4] + '/' +
                        'KoGPT2_checkpoint_' + str(count) + '.tar', 'rb')

                    names = 'KoGPT2_checkpoint_' + str(count) + '.tar'

                    # 장르/체크포인트 부분으로 저장
                    large_file_path = '/' + data_file_path.split(
                        "/")[-1][:-4] + '/' + names

                    #print("문제 시작2")

                    CHUNK_SIZE = 1024 * 1024 * 150

                    chunk = large_file.read(CHUNK_SIZE)
                    session_info = dbx.files_upload_session_start(chunk)
                    cursor = dropbox.files.UploadSessionCursor(
                        session_id=session_info.session_id,
                        offset=large_file.tell(),
                    )

                    print("문제 시작3")
                    # 남은 청크들 업로드용 loop
                    while True:
                        chunk = large_file.read(CHUNK_SIZE)

                        if not chunk:
                            dbx.files_upload_session_finish(
                                b'',
                                dropbox.files.UploadSessionCursor(
                                    session_id=session_info.session_id,
                                    offset=large_file.tell(),
                                ),
                                dropbox.files.CommitInfo(
                                    large_file_path,
                                    dropbox.files.WriteMode('add'),
                                ),
                            )
                            break
                        else:
                            # 청크 분할 후 남은 데이터 appending
                            dbx.files_upload_session_append_v2(chunk, cursor)
                            cursor.offset = large_file.tell()
                    logger.warning('학습한 모델 파일 업로드 완료')

                    #print("문제 시작4")

                    # 액세스 토큰 폴더 내 존재하는 폴더/파일 출력
                    logger.warning('대용량 파일 업로드 후 폴더/파일 목록:')
                    for entry in dbx.files_list_folder('').entries:
                        logger.warning("\t" + entry.name)

                    # 파일 삭제
                    #print("문제 시작5")
                    os.remove(save_path + data_file_path.split("/")[-1][:-4] +
                              '/' + 'KoGPT2_checkpoint_' + str(count) + '.tar')

                    # 휴지통 비우기
                    #print("문제 시작6")
                    logging.getLogger('googleapiclient.discovery').setLevel(
                        logging.CRITICAL)

                    for a_file in my_drive.ListFile({
                            'q': "trashed = true"
                    }).GetList():
                        a_file.Delete()

                except Exception:
                    pass  # don't let a failed save/upload stop training

            if avg_loss[0] / avg_loss[1] < 1.0:
                print("학습이 끝났어용!!")
                print("모델을 저장합니다.")
                # 모델 저장
                #try:
                torch.save(
                    {
                        'epoch': epoch,
                        'train_no': count,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss
                    }, save_path + data_file_path.split("/")[-1][:-4] + '/' +
                    'KoGPT2_checkpoint_' + str(count) + '.tar')

                #print("문제 시작")

                # 드롭박스에 저장
                large_file = open(
                    save_path + data_file_path.split("/")[-1][:-4] + '/' +
                    'KoGPT2_checkpoint_' + str(count) + '.tar', 'rb')

                names = 'KoGPT2_checkpoint_' + str(count) + '.tar'

                # 장르/체크포인트 부분으로 저장
                large_file_path = '/' + data_file_path.split(
                    "/")[-1][:-4] + '/' + names

                #print("문제 시작2")

                CHUNK_SIZE = 1024 * 1024 * 150

                chunk = large_file.read(CHUNK_SIZE)
                session_info = dbx.files_upload_session_start(chunk)
                cursor = dropbox.files.UploadSessionCursor(
                    session_id=session_info.session_id,
                    offset=large_file.tell(),
                )

                print("문제 시작3")
                # 남은 청크들 업로드용 loop
                while True:
                    chunk = large_file.read(CHUNK_SIZE)

                    if not chunk:
                        dbx.files_upload_session_finish(
                            b'',
                            dropbox.files.UploadSessionCursor(
                                session_id=session_info.session_id,
                                offset=large_file.tell(),
                            ),
                            dropbox.files.CommitInfo(
                                large_file_path,
                                dropbox.files.WriteMode('add'),
                            ),
                        )
                        break
                    else:
                        # 청크 분할 후 남은 데이터 appending
                        dbx.files_upload_session_append_v2(chunk, cursor)
                        cursor.offset = large_file.tell()
                logger.warning('학습한 모델 파일 업로드 완료')

                #print("문제 시작4")

                # 액세스 토큰 폴더 내 존재하는 폴더/파일 출력
                logger.warning('대용량 파일 업로드 후 폴더/파일 목록:')
                for entry in dbx.files_list_folder('').entries:
                    logger.warning("\t" + entry.name)

                # 파일 삭제
                #print("문제 시작5")
                os.remove(save_path + data_file_path.split("/")[-1][:-4] +
                          '/' + 'KoGPT2_checkpoint_' + str(count) + '.tar')

                # 휴지통 비우기
                #print("문제 시작6")
                logging.getLogger('googleapiclient.discovery').setLevel(
                    logging.CRITICAL)

                for a_file in my_drive.ListFile({
                        'q': "trashed = true"
                }).GetList():
                    a_file.Delete()

                return

            count += 1
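
Example #2 inlines the same Dropbox upload-session sequence twice. Factored into a helper, the flow is easier to follow. This is a sketch under the example's own assumptions: dbx is an authenticated dropbox.Dropbox client, and files above the single-request limit are streamed through the standard upload-session calls (files_upload_session_start, files_upload_session_append_v2, files_upload_session_finish) that the example itself uses.

import dropbox

CHUNK_SIZE = 1024 * 1024 * 150  # stay at or below Dropbox's ~150 MB per-request limit

def upload_large_file(dbx, local_path, remote_path):
    """Stream local_path to remote_path through a Dropbox upload session."""
    with open(local_path, 'rb') as f:
        session = dbx.files_upload_session_start(f.read(CHUNK_SIZE))
        cursor = dropbox.files.UploadSessionCursor(
            session_id=session.session_id, offset=f.tell())
        commit = dropbox.files.CommitInfo(remote_path,
                                          dropbox.files.WriteMode('add'))
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                # an empty final chunk closes the session and commits the file
                dbx.files_upload_session_finish(b'', cursor, commit)
                return
            dbx.files_upload_session_append_v2(chunk, cursor)
            cursor.offset = f.tell()

With this helper, both save branches in Example #2 reduce to a torch.save followed by upload_large_file(dbx, local_path, large_file_path) and the local cleanup.
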
Example #3
def main(epoch=200, save_path='./checkpoint/', load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
		data_file_path='dataset/lyrics_dataset.txt', batch_size=8, summary_url='runs/', new=0, text_size=100):
	ctx = 'cuda'
	cachedir = '~/kogpt2/'
	summary = SummaryWriter(summary_url)

	pytorch_kogpt2 = {
		'url': 'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
		'fname': 'pytorch_kogpt2_676e9bcfa7.params',
		'chksum': '676e9bcfa7'
	}
	kogpt2_config = {
		"initializer_range": 0.02,
		"layer_norm_epsilon": 1e-05,
		"n_ctx": 1024,
		"n_embd": 768,
		"n_head": 12,
		"n_layer": 12,
		"n_positions": 1024,
		"vocab_size": 50000
	}

	# download model
	model_info = pytorch_kogpt2
	model_path = download(model_info['url'],
						   model_info['fname'],
						   model_info['chksum'],
						   cachedir=cachedir)
	# download vocab
	vocab_info = tokenizer
	vocab_path = download(vocab_info['url'],
						   vocab_info['fname'],
						   vocab_info['chksum'],
						   cachedir=cachedir)

	# Declare a GPT2LMHeadModel for KoGPT-2 language-model training
	kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

	# Load the weights downloaded to model_path via load_state_dict
	kogpt2model.load_state_dict(torch.load(model_path))

	device = torch.device(ctx)
	kogpt2model.to(device)
	count = 0
	# Load from an existing checkpoint, if any
	try:
		checkpoint = torch.load(load_path, map_location=device)

		# Re-declare the GPT2LMHeadModel and restore the fine-tuned weights
		kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
		kogpt2model.load_state_dict(checkpoint['model_state_dict'])

		kogpt2model.eval()
	except Exception:
		print("count 0 : ", load_path)
	else:
		print("count check : ", re.findall(r"\d+", load_path))
		# take the largest number in the file name as the step count
		count = max(int(i) for i in re.findall(r"\d+", load_path))

	if new:
		count = 0
	# switch back to .train() to continue fine-tuning
	kogpt2model.train()
	vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
								mask_token=None,
								sep_token=None,
								cls_token=None,
								unknown_token='<unk>',
								padding_token='<pad>',
								bos_token='<s>',
								eos_token='</s>')

	tok_path = get_tokenizer()
	model, vocab = kogpt2model, vocab_b_obj
	sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

	dataset = Read_Dataset(data_file_path, vocab, sentencepieceTokenizer)
	data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

	learning_rate = 3e-5
	criterion = torch.nn.CrossEntropyLoss()  # unused: the model computes the LM loss itself
	optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

	## train
	# vocab.token_to_idx["\n"] = vocab.token_to_idx["<unused0>"]
	# del vocab.token_to_idx["<unused0>"]
	# vocab.token_to_idx["<|endoftext|>"] = vocab.token_to_idx["<unused1>"]
	# del vocab.token_to_idx["<unused1>"]

	model = model.to(ctx)
	tok = SentencepieceTokenizer(tok_path)

	print('KoGPT-2 Transfer Learning Start')
	avg_loss = (0.0, 0.0)  # (decayed loss sum, decayed count) for a moving average
	for epoch in range(epoch):
		for data in data_loader:
			optimizer.zero_grad()
			data = torch.stack(data)  # the batch arrives as a list of Tensors, so stack it into one
			data = data.transpose(1,0)
			data = data.to(ctx)
			model = model.to(ctx)

			outputs = model(data, labels=data)
			loss, logits = outputs[:2]
			loss = loss.to(ctx)
			loss.backward()
			avg_loss = (avg_loss[0] * 0.99 + loss.item(), avg_loss[1] * 0.99 + 1.0)
			optimizer.step()

			if count % 10 == 0:
				print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'.format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
				summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1], count)
				summary.add_scalar('loss/loss', loss, count)
				# print("save")
				# torch.save({
				# 	'epoch': epoch,
				# 	'train_no': count,
				# 	'model_state_dict': model.state_dict(),
				# 	'optimizer_state_dict': optimizer.state_dict(),
				# 	'loss': loss
				# }, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')

				# periodically generate a sample
				if (count > 0 and count % 1000 == 0) or (len(data) < batch_size):
					sent = sample_sequence(model.to("cpu"), tok, vocab, sent="성실", text_size=text_size, temperature=0.7, top_p=0.8, top_k=40)
					sent = sent.replace("<unused0>", "\n") # 비효율적이지만 엔터를 위해서 등장
					sent = auto_enter(sent)
					print(sent)
					summary.add_text('Text', sent, count)
					del sent

			#########################################
			if (count > 0 and count % 18500 == 0):
				# save a checkpoint
				try:
					torch.save({
						'epoch': epoch,
						'train_no': count,
						'model_state_dict': model.state_dict(),
						'optimizer_state_dict': optimizer.state_dict(),
						'loss': loss
					}, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')
				except Exception:
					pass  # ignore save failures and keep training
			count += 1
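
sample_sequence is also defined outside these examples. Based on its call signature, a minimal sketch of what it plausibly does: autoregressive decoding with a temperature, filtering the logits by top-k and nucleus (top-p) sampling before each draw. The stop condition, prompt handling, and SentencePiece detokenization below are assumptions, not the original implementation.

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    # top-k: keep only the k highest-scoring logits
    if top_k > 0:
        threshold = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < threshold] = filter_value
    # top-p (nucleus): drop tokens outside the smallest set whose
    # cumulative probability exceeds top_p
    if top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()
        remove[..., 0] = False
        logits[sorted_idx[remove]] = filter_value
    return logits

def sample_sequence(model, tok, vocab, sent, text_size,
                    temperature=0.7, top_p=0.8, top_k=40):
    # encode the prompt into a [1, prompt_len] id tensor
    generated = torch.tensor([[vocab[t] for t in tok(sent)]])
    model.eval()
    with torch.no_grad():
        for _ in range(text_size):
            logits = model(generated)[0][0, -1, :] / temperature
            logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
            next_id = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            if next_id.item() == vocab['</s>']:
                break
            generated = torch.cat([generated, next_id.view(1, 1)], dim=1)
    # crude SentencePiece detokenization: '▁' marks a word boundary
    return ''.join(vocab.to_tokens(generated[0].tolist())).replace('▁', ' ')
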
Example #4
def main(epoch=200, save_path='./checkpoint/', load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
		 data_file_path='dataset/lyrics_dataset.txt',
		 batch_size=8, summary_url='runs/', new=0, text_size=100):
	ctx = 'cuda'
	cachedir = '~/kogpt2/'
	summary = SummaryWriter(summary_url)

	pytorch_kogpt2 = {
		'url': 'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
		'fname': 'pytorch_kogpt2_676e9bcfa7.params',
		'chksum': '676e9bcfa7'
	}
	kogpt2_config = {
		"initializer_range": 0.02,
		"layer_norm_epsilon": 1e-05,
		"n_ctx": 1024,
		"n_embd": 768,
		"n_head": 12,
		"n_layer": 12,
		"n_positions": 1024,
		"vocab_size": 50000
	}

	# download model
	model_info = pytorch_kogpt2
	model_path = download(model_info['url'],
						   model_info['fname'],
						   model_info['chksum'],
						   cachedir=cachedir)
	# download vocab
	vocab_info = tokenizer
	vocab_path = download(vocab_info['url'],
						   vocab_info['fname'],
						   vocab_info['chksum'],
						   cachedir=cachedir)

	# Declare a GPT2LMHeadModel for KoGPT-2 language-model training
	kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))

	# Load the weights downloaded to model_path via load_state_dict,
	# i.e. start from the pretrained base model's parameters
	kogpt2model.load_state_dict(torch.load(model_path))

	device = torch.device(ctx) #GPU
	kogpt2model.to(device)
	count = 0

	# Load from an existing checkpoint, if any
	try:
		checkpoint = torch.load(load_path, map_location=device)

		# Re-declare the GPT2LMHeadModel and restore the fine-tuned weights
		kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
		kogpt2model.load_state_dict(checkpoint['model_state_dict'])

		kogpt2model.eval()
	except Exception:
		print("count 0 : ", load_path)
	else:
		print("count check : ", re.findall(r"\d+", load_path))
		# take the largest number in the file name as the step count
		count = max(int(i) for i in re.findall(r"\d+", load_path))

	if new:
		count = 0

	# switch back to .train() to continue fine-tuning
	kogpt2model.train()
	vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
														 mask_token=None,
														 sep_token=None,
														 cls_token=None,
														 unknown_token='<unk>',
														 padding_token='<pad>',
														 bos_token='<s>',
														 eos_token='</s>')

	tok_path = get_tokenizer()
	model, vocab = kogpt2model, vocab_b_obj
	sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

	# load our dataset
	dataset = Read_Dataset(data_file_path, vocab, sentencepieceTokenizer)
	data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

	# training hyperparameters
	learning_rate = 3e-5
	criterion = torch.nn.CrossEntropyLoss()  # unused: the model computes the LM loss itself
	optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

	model = model.to(ctx)
	# a tokenizer instance makes BPE splitting and joining convenient
	tok = SentencepieceTokenizer(tok_path)

	print('KoGPT-2 Transfer Learning Start')

	# create a per-genre checkpoint folder if it does not exist yet
	try:
		if not(os.path.isdir(save_path + data_file_path.split("/")[-1][:-4])):
			os.makedirs(os.path.join(save_path + data_file_path.split("/")[-1][:-4]))
	except OSError as e:
		if e.errno != errno.EEXIST:
			print("Failed to create directory!!!!!")
			raise
	
	avg_loss = (0.0, 0.0)  # (decayed loss sum, decayed count) for a moving average
	for epoch in range(epoch):
		# fetch batches and train
		for datas in data_loader:
			data = datas[0]

			optimizer.zero_grad()
			data = torch.stack(data)  # the batch arrives as a list of Tensors, so stack it into one
			data = data.transpose(1,0)
			data = data.to(ctx)
			model = model.to(ctx)

			# forward pass
			outputs = model(data, labels=data)
			loss, logits = outputs[:2]

			nowloss = copy.copy(loss)  # keep the unweighted loss (currently unused)
			# running average: avg_loss[0] / avg_loss[1] is the normalized loss
			avg_loss = (avg_loss[0] * 0.99 + loss.item(), avg_loss[1] * 0.99 + 1.0)

			loss *= datas[2][0] # weight the loss by the per-sample score

			loss = loss.to(ctx)
			loss.backward()

			# apply the update
			optimizer.step()

			if count % 10 == 0:
				print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}'.format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
				summary.add_scalar('loss/avg_loss', avg_loss[0] / avg_loss[1], count)
				summary.add_scalar('loss/loss', loss, count)
				# print("save")
				# torch.save({
				# 	'epoch': epoch,
				# 	'train_no': count,
				# 	'model_state_dict': model.state_dict(),
				# 	'optimizer_state_dict': optimizer.state_dict(),
				# 	'loss': loss
				# }, save_path + 'KoGPT2_checkpoint_' + str(count) + '.tar')

				# periodically generate a sample
				if (count > 0 and count % 2500 == 0):
					sent = sample_sequence(model.to("cpu"), tok, vocab, sent="가", text_size=text_size, temperature=0.7, top_p=0.9, top_k=100)
					sent = sent.replace("//", "\n") # 비효율적이지만 엔터를 위해서 등장
					sent = auto_enter(sent)
					print(sent)
					summary.add_text('Text', sent, count)
					del sent

			#########################################
			if (count > 0 and count % 10000 == 0):
				print("모델을 저장합니다.")
				# 모델 저장
				try:
					torch.save({
						'epoch': epoch,
						'train_no': count,
						'model_state_dict': model.state_dict(),
						'optimizer_state_dict': optimizer.state_dict(),
						'loss': loss
					}, save_path + data_file_path.split("/")[-1][:-4] + '/' + 'KoGPT2_checkpoint_' + str(count) + '.tar')
				except Exception:
					pass  # ignore save failures and keep training

			if avg_loss[0] / avg_loss[1] < 1.0:
				print("학습완료")
				print("모델저장")
				# 모델 저장
				#try:
				torch.save({
					'epoch': epoch,
					'train_no': count,
					'model_state_dict': model.state_dict(),
					'optimizer_state_dict': optimizer.state_dict(),
					'loss': loss
				}, save_path + data_file_path.split("/")[-1][:-4] + '/' + 'KoGPT2_checkpoint_' + str(count) + '.tar')

				return

			count += 1
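
For completeness, a guess at auto_enter (the real helper is not shown; the examples only tell us it tidies the generated text after the "//" or "<unused0>" markers are turned into newlines), plus a hypothetical entry point that mirrors the defaults of Examples #2-#4:

def auto_enter(text):
    # assumption: strip whitespace and drop blank lines so each generated
    # lyric line comes out on its own row
    lines = [line.strip() for line in text.split("\n")]
    return "\n".join(line for line in lines if line)

if __name__ == '__main__':
    # resume from the long-run checkpoint and write TensorBoard logs to runs/
    main(epoch=200,
         save_path='./checkpoint/',
         load_path='./checkpoint/KoGPT2_checkpoint_long.tar',
         data_file_path='dataset/lyrics_dataset.txt',
         batch_size=8,
         summary_url='runs/',
         new=0,
         text_size=100)
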