def chat(kogptqa, sent='0'):
    tok_path = get_tokenizer()
    _, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    sent_tokens = tok(sent)
    with torch.no_grad():
        while 1:
            q = input('user > ').strip()
            if q == 'quit':
                break
            q_tok = tok(q)
            a = ''
            a_tok = []
            while 1:
                input_ids = torch.LongTensor([
                    vocab[U_TKN]] + vocab[q_tok] +
                    vocab[EOS, SENT] + vocab[sent_tokens] +
                    vocab[EOS, S_TKN] +
                    vocab[a_tok]).unsqueeze(dim=0)
                pred = kogptqa(input_ids)
                gen = vocab.to_tokens(
                    torch.argmax(
                        pred,
                        dim=-1).squeeze().numpy().tolist())[-1]
                if gen == EOS:
                    break
                a += gen.replace('▁', ' ')
                a_tok = tok(a)
            print("Simsimi > {}".format(a.strip()))
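
# Hedged usage sketch (not part of the original snippet): `chat` above expects a
# fine-tuned KoGPT2 LM and assumes the KoGPT2-chatbot special tokens U_TKN, S_TKN,
# EOS and SENT are defined in this module; the checkpoint path is hypothetical.
# if __name__ == '__main__':
#     model, _ = get_pytorch_kogpt2_model()
#     model.load_state_dict(torch.load('chatbot_model.ckpt'))  # hypothetical checkpoint
#     model.eval()
#     chat(model)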
def Tokenizer(item, max_seqlen=1024):
    # `max_seqlen` was referenced but never defined in the original snippet;
    # it is exposed here as a parameter (the 1024 default is an assumption).
    item = list(np.array(item.tolist()))
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    out = torch.empty(0, max_seqlen, dtype=torch.long)
    for i in item:
        toked = tok(i)
        input_ids = torch.tensor([
            vocab[vocab.bos_token],
        ] + vocab[toked]).unsqueeze(0)
        size = input_ids.shape
        # print(input_ids)
        # print(input_ids.shape)
        # right-pad with zeros up to max_seqlen (assumes index 0 is treated as padding)
        y = torch.cat(
            [input_ids,
             torch.zeros(1, max_seqlen - size[1], dtype=torch.long)], dim=1)
        out = torch.cat([out, y], dim=0)
    print(out.shape)
    x_np = out.numpy()
    x_df = pd.DataFrame(x_np)
    x_df.to_csv('./data/encoded.csv', mode='w')
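
# Hedged usage sketch (not in the original): `Tokenizer` expects an array-like of
# raw sentences (it calls .tolist()), e.g. a pandas Series; the input path below
# is hypothetical.
# sentences = pd.read_csv('./data/raw.csv', header=None)[0]
# Tokenizer(sentences, max_seqlen=1024)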
def __init__(self, hparams, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams
    self.tok_path = get_tokenizer()
    self.neg = -1e18
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def __init__(self, args, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.hparams = args  # TK TODO
    self.hparams.max_len = 1024
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def __init__(self, hparams, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams
    self.tok_path = get_tokenizer()
    self.neg = -1e18
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model("cuda")
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
    self.max_gpu_load_train = 0
    self.max_memory_used_train = 0.0
def __init__(self, max_len=32, batch_size=64, lr=5e-5, num_epochs=1):
    super(KoGPT2Chat, self).__init__()
    self.batch_size = batch_size
    self.lr = lr
    self.max_len = max_len
    self.tok_path = get_tokenizer()
    self.num_epochs = num_epochs
    self.neg = -1e18
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
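
# Hedged usage sketch (not in the original): the constructors above belong to a
# KoGPT2Chat module (assumed to subclass pytorch_lightning.LightningModule); a
# model can be built from explicit keyword arguments or from parsed CLI args.
# model = KoGPT2Chat(max_len=32, batch_size=64, lr=5e-5, num_epochs=1)
# model = KoGPT2Chat(parser.parse_args())  # for the hparams-style variants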
def __init__(self, hparams, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams  # the parsed args are stored in hparams
    self.tok_path = get_tokenizer()
    self.neg = -1e18
    # the helper returns both the model and the vocabulary
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
    # CrossEntropyLoss compares the target labels with GPT-2's output logits
    self.loss_function = torch.nn.CrossEntropyLoss(reduction='none')
def __init__(self):
    self.PAD_IDX = 0
    self.UNK_IDX = 1
    self.PAD_TOKEN = 'PAD_TOKEN'
    self.UNK_TOKEN = 'UNK_TOKEN'
    self.tok = Mecab()
    _, self.vocab = get_pytorch_kogpt2_model()
    self.tok_path = get_tokenizer()
    self.tok2 = SentencepieceTokenizer(self.tok_path, num_best=0, alpha=0)
def __init__(self, examples):
    super(Dataset, self).__init__()
    self.examples = examples
    self.training = False
    self.PAD_IDX = 0
    self.UNK_IDX = 1
    self.PAD_TOKEN = 'PAD_TOKEN'
    self.UNK_TOKEN = 'UNK_TOKEN'
    self.tok = Mecab()
    _, self.vocab = get_pytorch_kogpt2_model()
    count += 1
    randomNum = random.randint(0, count)
    return vocab.to_tokens(idx[randomNum])


def top_k(lists, vocab, k):
    item, idx = torch.sort(lists, descending=True)
    randomNum = random.randint(0, k)
    idx = idx.tolist()
    return vocab.to_tokens(idx[randomNum])


device = torch.device('cpu')
ck = 'mediumcheck.tar'
tok_path = get_tokenizer()
model, vocab = get_pytorch_kogpt2_model(ctx='cpu')
checkpoint = torch.load(ck, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
tok = SentencepieceTokenizer(tok_path)
print(vocab[vocab.bos_token])
print(vocab.bos_token)
sent_p = '세계 200여 개 기업이 '
sent_k = '세계 200여 개 기업이 '
sent_argmax = '세계 200여 개 기업이 '
toked_p = tok(sent_p)
toked_k = tok(sent_k)
toked_argmax = tok(sent_argmax)
sent_cnt = 0
input_ids_p = torch.tensor([
    vocab[vocab.bos_token],
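
# Hedged usage sketch (not in the original): sample the next token from the
# model's last-position logits with the top_k helper above; indexing assumes the
# model returns logits as the first element of its output tuple.
# logits = model(input_ids_p)[0][0, -1]
# next_token = top_k(logits, vocab, k=5)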
def main(args):
    # toker = GPT2Tokenizer.from_pretrained('gpt2')
    tok_path = get_tokenizer()
    toker = SentencepieceTokenizer(tok_path)
    _, vocab = get_pytorch_kogpt2_model()
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
                   f'{".".join(attrs)}.db/db')
    else:
        db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with shelve.open(db_path, 'n') as db:
        # reader = open(args.corpus, "r", encoding="utf-8")
        reader = pd.read_csv(args.corpus, sep='\t', header=None)
        chunk = []
        n_chunk = 0
        n_example = 0
        # print("pdb-attach")
        # from pdb_clone import pdb
        # rsock = pdb.set_trace_remote()
        #
        # if rsock.state != rsock.ST_CONNECTED:
        #     input()
        for _, line in tqdm(reader.iterrows(), total=len(reader.index)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1
                weights, inputs = _get_inputs_from_text(line, toker, vocab)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs,
                                          toker, vocab, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {
        'n_example': n_example,
        'chunk_size': args.chunk_size,
        'max_seq_len': args.max_seq_len,
        'reverse': args.reverse,
        'two_turn': args.two_turn
    }
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
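
# Hedged usage sketch (not in the original): the flags mirror the attributes read
# from `args` in main() above (corpus, max_seq_len, chunk_size, reverse, two_turn);
# the argparse definitions live elsewhere, so the flag names, values and the
# script name are assumptions.
# $ python prepro.py --corpus data/dialog.tsv --max_seq_len 128 --chunk_size 65536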
def __init__(self, hparams, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.hparams = hparams
    self.neg = -1e18
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
def main(args):
    tok_path = get_tokenizer()
    model, vocab = get_pytorch_kogpt2_model()
    tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    batch_size = args.batch_size
    epochs = args.n_epochs
    learning_rate = 3e-5
    warmup_steps = 2000
    max_seq_len = 1024

    print("Dataset Loading... ", end=" ")
    dataset = synoDataset("./data/korean_naver_2.csv", vocab, tok)
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False)
    print("[[[Done]]]")

    model = model.to(device)
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=-1)
    proc_seq_count = 0
    sum_loss = 0.0
    batch_count = 0
    model.zero_grad()

    models_folder = "trained_models"
    if not os.path.exists(models_folder):
        os.mkdir(models_folder)

    for epoch in range(epochs):
        print(f"Epoch {epoch} started" + "=" * 30)
        for idx, syno in enumerate(data_loader):
            # truncate sequences that exceed the maximum length
            if len(syno) > max_seq_len:
                syno = syno[:max_seq_len]
            syno_tensor = torch.tensor(syno).unsqueeze(0).to(device)
            outputs = model(syno_tensor, labels=syno_tensor)
            loss, logits = outputs[:2]
            loss.backward()
            sum_loss = sum_loss + loss.detach().data

            proc_seq_count = proc_seq_count + 1
            if proc_seq_count == batch_size:
                proc_seq_count = 0
                batch_count += 1
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            if batch_count == args.print_every:
                print(
                    f"average loss over the last {args.print_every} updates: "
                    f"{sum_loss / args.print_every}"
                )
                batch_count = 0
                sum_loss = 0.0

        # Store the model after each epoch to compare their performance
        if epoch % args.save_every == 0:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir, f"gpt2_genre_pad_{epoch}.pt"),
            )
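
# Hedged usage sketch (not in the original): the flags mirror the attributes read
# from `args` in main() above (batch_size, n_epochs, print_every, save_every,
# save_dir); the actual argparse definitions and the script name are assumptions.
# $ python train_synopsis.py --batch_size 8 --n_epochs 10 --print_every 100 \
#       --save_every 1 --save_dir trained_models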
def __init__(self, **kwargs):
    super(KoGPT2Chat, self).__init__()
    self.neg = -1e18
    self.kogpt2, self.vocab = get_pytorch_kogpt2_model()
    self._tok_path = get_tokenizer()
    self.previous_context = [[]]
import os
import torch
import platform
import sentencepiece

from kogpt2.utils import get_tokenizer
from kogpt2.pytorch_kogpt2 import get_pytorch_kogpt2_model
from flask import Flask, request, jsonify, __version__ as flaskver

tok_path = get_tokenizer(cachedir='./bin/')
model, vocab = get_pytorch_kogpt2_model(cachedir='./bin/')
tok = sentencepiece.SentencePieceProcessor(tok_path)

app = Flask(__name__)
port = int(os.getenv('port', '8080'))


@app.route('/', methods=['GET'])
def root():
    env = {
        'python': platform.python_version(),
        'flask': flaskver,
        'pytorch': torch.__version__
    }
    urls = {
        'original': 'https://github.com/SKT-AI/KoGPT2',
        'fork': 'https://github.com/pmh-only/KoGPT2'
    }
    usage = 'GET /job?query=<sentence>[&loop=<loopLimit>]'
    return jsonify(label='kogpt2', urls=urls, env=env, usage=usage)
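
# Hedged usage sketch (not in the original): querying the server locally, using
# the usage string advertised by the root route; the /job handler itself is
# defined elsewhere in this file, so the response shape is not shown here.
# $ curl 'http://localhost:8080/'
# $ curl 'http://localhost:8080/job?query=안녕하세요&loop=3'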
from kogpt2.model.torch_gpt2 import GPT2LMHeadModel
from kogpt2.configuration_gpt2 import GPT2Config

kogpt2_config = {
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50000,
}

tok_path = get_tokenizer()
model, vocab = get_pytorch_kogpt2_model()
tok = SentencepieceTokenizer(tok_path, num_best=0, alpha=0)

device = "cpu"
if torch.cuda.is_available():
    device = torch.device("cuda:2")
    torch.cuda.device("cuda:2")
print(device)

org_path = "trained_models/gpt2_j20_1007.pt"
load_path = "trained_models/gpt2_genre_pad_50.pt"

checkpoint = torch.load(load_path, map_location=device)
# 1013: after the special tokens were trained, the state dict keys changed,
# so the checkpoints have to be remapped as below
checkpoint_org = torch.load(org_path, map_location=device)
ckpt_final = {
    k: v
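
# Hedged sketch (not in the original; the dict comprehension above is truncated):
# one plausible remapping keeps only the fine-tuned entries whose keys also exist
# in the original checkpoint and then loads them non-strictly.
# ckpt_final = {k: v for k, v in checkpoint.items() if k in checkpoint_org}
# model.load_state_dict(ckpt_final, strict=False)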
def train():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--use_adapter", default=False, action='store_true',
                        help="Use adapter or not")
    parser.add_argument("--keyword_module", type=str, default="",
                        help="add, attention, ")
    parser.add_argument("--model_checkpoint", type=str, default="bertGpt",
                        help="Path, url or short name of the model")
    parser.add_argument("--train_batch_size", type=int, default=4,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--max_norm", type=float, default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3,
                        help="Number of training epochs")
    parser.add_argument("--eval_before_start", action='store_true',
                        help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="",
                        help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="Local rank for distributed training (-1: not distributed)")
    parser.add_argument("--bert_model_path", default="./", type=str,
                        help="Bert pre-trained model path")
    parser.add_argument("--vocab_file", default="./vocab.korean.rawtext.list", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # This is a logger.warning: it will be printed by all distributed processes
    logger.warning("Running process %d", args.local_rank)
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    # tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    #     # cant use Autotokenizer because checkpoint could be a Path
    # tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(
        args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_kogpt2_model(
        keyword_module=args.keyword_module, use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed
    # (order is important, distributed should be the last)
    # if args.fp16:
    #     from apex import amp  # Apex is only required if we use fp16 training
    #     model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, bert_tokenizer, gpt_tokenizer, gpt_vocab)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        source_ids, target_ids, lm_labels, keyword_scores = batch
        # (lm_loss), *_ = model(input_ids, token_type_ids=token_type_ids, labels=lm_labels)
        (lm_loss), *_ = model(source_ids, target_ids,
                              key_score=keyword_scores, lm_labels=lm_labels)
        loss = lm_loss / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            source_ids, target_ids, lm_labels, keyword_scores = batch
            # lm_logits, *_ = model(input_ids, token_type_ids=token_type_ids,)
            lm_logits, *_ = model(source_ids, target_ids,
                                  key_score=keyword_scores)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted), (lm_labels_flat_shifted)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training
    # and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the
    # distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
                    output_transform=lambda x: (x[0], x[1]))
    }
    metrics.update({
        "average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save
    # model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED,
            lambda _: pbar.log_message("Validation: %s" %
                                       pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint, args.dataset_path,
                              args.use_adapter, args.keyword_module)
        tb_logger = TensorboardLogger(log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint',
                                             save_interval=1, n_saved=2)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': model})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        # getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))
        # tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
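
# Hedged usage sketch (not in the original): launching the trainer with the flags
# defined by the ArgumentParser in train() above; the script name and dataset
# path are hypothetical.
# $ python train.py --dataset_path data/dialog.tsv --train_batch_size 4 \
#       --gradient_accumulation_steps 8 --n_epochs 3 --use_adapter \
#       --keyword_module attention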