import os
import logging

import torch
from transformers import GPT2Config, GPT2LMHeadModel

from new_tokenizer import MyTokenizer

logger = logging.getLogger(__name__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def model_fn(model_dir):
    """SageMaker inference entry point: load the tokenizer and model from model_dir."""
    logger.info('Loading the model.')
    vocab_file_path = os.path.join(model_dir, 'vocab.json')
    merge_file_path = os.path.join(model_dir, 'merges.txt')
    model_file_path = os.path.join(model_dir, 'lyric_model.bin')

    tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
    bos = tokenizer.convert_tokens_to_ids('<s>')
    eos = tokenizer.convert_tokens_to_ids('</s>')
    pad = tokenizer.convert_tokens_to_ids('<pad>')
    unk = tokenizer.convert_tokens_to_ids('<unk>')

    # 52000-token base BPE vocabulary + 3 added special tokens; dropout is
    # disabled because the model is only used for inference here.
    config = GPT2Config(vocab_size=52003, resid_pdrop=0, embd_pdrop=0,
                        attn_pdrop=0, summary_first_dropout=0)
    model = GPT2LMHeadModel(config)
    model.load_state_dict(torch.load(model_file_path, map_location=device), strict=False)
    model.to(device)
    return model, tokenizer
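# Illustrative only: a minimal companion predict_fn for the handler above,
# assuming the SageMaker PyTorch serving convention (it receives whatever
# model_fn returned) and assuming MyTokenizer wraps a Hugging Face
# `tokenizers` BPE, i.e. exposes encode(text).ids and decode(ids).
def predict_fn(input_data, model_and_tokenizer):
    model, tokenizer = model_and_tokenizer
    model.eval()
    # Assumption: input_data is a plain prompt string.
    input_ids = torch.tensor([tokenizer.encode(input_data).ids]).to(device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=100,
            pad_token_id=tokenizer.convert_tokens_to_ids('<pad>'),
            eos_token_id=tokenizer.convert_tokens_to_ids('</s>'),
        )
    return tokenizer.decode(output[0].tolist())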
import json

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Config, AdamW

from new_tokenizer import MyTokenizer

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
model_dir = '../model/pytorch_model.bin'
model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<social>', '<economy>', '<world>', '<science>', '<sports>',
                         '<politics>', '<entertainment>', '<it>', '<title>', '</title>']

# Korean news-category labels mapped to the control tokens above
# (society, economy, world, IT/science, sports, politics, entertainment, IT).
category_map = {'사회': '<social>', '경제': '<economy>', '세계': '<world>',
                'IT/과학': '<science>', '스포츠': '<sports>', '정치': '<politics>',
                '연예': '<entertainment>', 'IT': '<it>'}


def add_special_tokens_(model, tokenizer):
    """Register the control tokens and grow the embedding matrix to match."""
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)


add_special_tokens_(model, tokenizer)

b_title = tokenizer.convert_tokens_to_ids('<title>')
e_title = tokenizer.convert_tokens_to_ids('</title>')
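# Illustrative only: one way a headline-generation example could be laid out
# with the tokens above. The exact ordering used by this repo's dataset code
# is not shown here, so treat this layout as an assumption; article_ids and
# title_ids are placeholder token-id lists.
def build_example(article_ids, title_ids, category):
    cat_id = tokenizer.convert_tokens_to_ids(category_map[category])
    # category control token, article body, then the headline wrapped in
    # <title> ... </title>
    return [cat_id] + article_ids + [b_title] + title_ids + [e_title]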
import torch
import kss
from transformers import GPT2LMHeadModel, GPT2Config

from new_tokenizer import MyTokenizer

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
answer_tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
question_tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

# 52000 base tokens plus the special tokens registered below (+1, matching
# the resize convention used throughout these scripts).
answer_config = GPT2Config(vocab_size=52004)
question_config = GPT2Config(vocab_size=52005)
answer_model = GPT2LMHeadModel(answer_config)
question_model = GPT2LMHeadModel(question_config)

answer_model_dir = '../KorGPT-2SampleModel/answer_model.bin'
question_model_dir = '../KorGPT-2SampleModel/question_model.bin'
answer_model.load_state_dict(torch.load(answer_model_dir), strict=False)
question_model.load_state_dict(torch.load(question_model_dir), strict=False)
answer_model.to('cpu')
question_model.to('cpu')


def add_special_tokens_(model, tokenizer, added_tokens):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(added_tokens)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + len(added_tokens) + 1)


added_answer_tokens = ['<answer>', '<sep>', '</answer>']
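# Illustrative only: registering the tokens with each model. The answer-token
# list comes from the line above; the question-token list is an assumption
# (it is not shown in this snippet), chosen to match question_config's
# vocab_size of 52005 (52000 + 4 + 1) and the sibling question-answer script.
added_question_tokens = ['<answer>', '</answer>', '<question>', '</question>']
add_special_tokens_(answer_model, answer_tokenizer, added_answer_tokens)
add_special_tokens_(question_model, question_tokenizer, added_question_tokens)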
import torch
from transformers import GPT2LMHeadModel, GPT2Config

from new_tokenizer import MyTokenizer

# Category control tokens plus the <title> markers; 52000 base tokens + these
# 10 + 1 gives the vocab_size of 52011 used below.
ATTR_TO_SPECIAL_TOKEN = [
    '<social>', '<economy>', '<world>', '<science>', '<sports>',
    '<politics>', '<entertainment>', '<it>', '<title>', '</title>'
]
# Korean news-category labels mapped to the control tokens above.
category_map = {
    '사회': '<social>',
    '경제': '<economy>',
    '세계': '<world>',
    'IT/과학': '<science>',
    '스포츠': '<sports>',
    '정치': '<politics>',
    '연예': '<entertainment>',
    'IT': '<it>'
}

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

config = GPT2Config(vocab_size=52011, resid_pdrop=0, embd_pdrop=0,
                    attn_pdrop=0, summary_first_dropout=0)
model = GPT2LMHeadModel(config)

# model_dir = '../KorGPT-2SampleModel/lyric_model.bin'
model_dir = '../model/summary_model.bin'
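# Illustrative only: a minimal decode sketch for the summarization checkpoint.
# The snippet above ends before the weights are loaded, so the load call
# appears here, and the input layout (article text followed by <title>) is an
# assumption borrowed from the headline scripts.
model.load_state_dict(torch.load(model_dir, map_location='cpu'), strict=False)
model.eval()
b_title = tokenizer.convert_tokens_to_ids('<title>')


def summarize(article_ids, max_length=128):
    # article_ids: token-id list for the article body (placeholder input)
    input_ids = torch.tensor([[bos] + article_ids + [b_title]])
    return model.generate(input_ids, max_length=max_length,
                          eos_token_id=eos, pad_token_id=pad)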
import logging

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import (TensorboardLogger,
                                                        OutputHandler,
                                                        OptimizerParamsHandler)
from transformers import (AdamW, GPT2DoubleHeadsModel, GPT2Config,
                          WEIGHTS_NAME, CONFIG_NAME, cached_path)

from new_tokenizer import MyTokenizer

vocab_file_path = 'model/kogpt2_news_wiki_ko_cased_818bfa919d.spiece'
tokenizer = MyTokenizer(vocab_file_path)

SPECIAL_TOKENS = ["<s>", "</s>", "<sent1>", "<sent2>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {
    'bos_token': '<s>',
    'eos_token': '</s>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<sent1>', '<sent2>']
}

# Tensor names fed to GPT2DoubleHeadsModel; the entries in PADDED_INPUTS are
# padded to a common length before batching.
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

logger = logging.getLogger(__file__)
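# Illustrative only: the padding helper this kind of double-heads training
# script normally uses (modeled on the transfer-learning-conv-ai recipe it
# resembles). Padding lm_labels with -100 so the LM loss ignores padded
# positions is an assumption, as is padding the other inputs with <pad>.
def pad_dataset(dataset, padding=0):
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        pad_value = padding if name != "lm_labels" else -100
        dataset[name] = [x + [pad_value] * (max_l - len(x)) for x in dataset[name]]
    return dataset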
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
import kss

from new_tokenizer import MyTokenizer

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
model_dir = '../KorGPT-2SampleModel/pytorch_model.bin'
model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<answer>', '</answer>', '<question>', '</question>']


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)


add_special_tokens_(model, tokenizer)
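# Illustrative only: a minimal Dataset for question-generation pairs. The
# <answer> ... </answer> <question> ... </question> layout and the pair format
# are assumptions; the repo's real dataset class is not shown in this snippet.
class QGDataset(Dataset):
    def __init__(self, examples):
        # examples: list of (answer_ids, question_ids) token-id list pairs
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        answer_ids, question_ids = self.examples[idx]
        tid = tokenizer.convert_tokens_to_ids
        ids = ([tid('<answer>')] + answer_ids + [tid('</answer>')] +
               [tid('<question>')] + question_ids + [tid('</question>')])
        return torch.tensor(ids)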
import torch
from transformers import GPT2LMHeadModel, GPT2Config

from new_tokenizer import MyTokenizer

ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

# 52000 base tokens + 2 song markers + 1; dropout disabled for inference.
config = GPT2Config(vocab_size=52003, resid_pdrop=0, embd_pdrop=0,
                    attn_pdrop=0, summary_first_dropout=0)
model = GPT2LMHeadModel(config)

model_dir = '../KorGPT-2SampleModel/lyric_model.bin'
model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cpu')


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)
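# Illustrative only: a token-by-token top-k sampling loop for lyric
# generation. The decoding strategy and the <song> ... </song> prompt framing
# are assumptions; prompt_ids is a placeholder token-id list.
add_special_tokens_(model, tokenizer)


def sample_lyrics(prompt_ids, steps=100, top_k=40):
    ids = torch.tensor([[tokenizer.convert_tokens_to_ids('<song>')] + prompt_ids])
    e_song = tokenizer.convert_tokens_to_ids('</song>')
    model.eval()
    with torch.no_grad():
        for _ in range(steps):
            logits = model(ids)[0][0, -1]  # next-token logits
            top = torch.topk(logits, top_k)
            probs = torch.softmax(top.values, dim=-1)
            next_id = top.indices[torch.multinomial(probs, 1)]
            ids = torch.cat([ids, next_id.view(1, 1)], dim=1)
            if next_id.item() == e_song:
                break
    return ids[0].tolist()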
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
import kss

from new_tokenizer import MyTokenizer

vocab_file_path = '../tokenizer/vocab.json'
merge_file_path = '../tokenizer/merges.txt'
tokenizer = MyTokenizer(vocab_file_path, merge_file_path)

config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
model_dir = '../KorGPT-2SampleModel/pytorch_model.bin'
model.load_state_dict(torch.load(model_dir), strict=False)
model.to('cuda')

ATTR_TO_SPECIAL_TOKEN = ['<answer>', '<sep>', '</answer>']


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)


add_special_tokens_(model, tokenizer)
import os
import argparse

import torch
from transformers import GPT2LMHeadModel, GPT2Config

from new_tokenizer import MyTokenizer

parser = argparse.ArgumentParser()
parser.add_argument('--num-hidden-layers', type=int, default=6)
parser.add_argument('--type-vocab-size', type=int, default=1)
parser.add_argument('--token-max-len', type=int, default=512)

# Data and model checkpoint/output directories from the container environment
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
args = parser.parse_args()

vocab_file_path = os.path.join(args.data_dir, 'tokenizer/vocab.json')
merge_file_path = os.path.join(args.data_dir, 'tokenizer/merges.txt')
model_file = os.path.join(args.data_dir, 'KorGPT-2SampleModel/pytorch_model.bin')

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
tokenizer.save(args.model_dir)  # save it to the model dir for generation

config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_file, map_location=device), strict=False)
model.to("cpu").eval()

# Memory check (get_gpu_memory is defined elsewhere in this script)
get_gpu_memory()

ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()