def __init__(
    self,
    config,
    class_labels,
    pretrained_model_path,
    dropout=0.1,
    freeze_pretrained_part=True,
    reinitialize=False,
    n_layers=6,
):
    super().__init__(config, class_labels)
    if reinitialize:
        logger.info('resetting model weights')
        config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
        config = config.to_dict()
        config['n_layer'] = n_layers
        config = GPT2Config.from_dict(config)
        self.gpt2 = GPT2Model(config)
    else:
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)
    self.dropout = torch.nn.Dropout(dropout)
    self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)
    if freeze_pretrained_part:
        for param in self.gpt2.parameters():
            param.requires_grad = False
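A minimal forward-pass sketch to show how this classification head might be used; the method name and the last-token pooling are assumptions for illustration, not taken from the source class:

def forward(self, input_ids, attention_mask=None):
    # older transformers versions return a tuple; element 0 is the hidden states
    hidden_states = self.gpt2(input_ids, attention_mask=attention_mask)[0]  # (batch, seq_len, n_embd)
    pooled = hidden_states[:, -1, :]      # pool the last position (one common choice)
    return self.fc(self.dropout(pooled))  # (batch, output_dim)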
def __init__(self, max_output_length=25, max_input_length=300, device='cpu',
             tokenizer_type='gpt2', bpe_model="", starter_model=None):
    if tokenizer_type == "gpt2":
        self.tokenizer = utils_tokenizer.GPT2Tokenizer()
        config = GPT2Config.from_pretrained("gpt2")
    elif tokenizer_type == "bpecap":
        self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
        config = GPT2Config.from_dict({
            "finetuning_task": None,
            "initializer_range": 0.02,
            "layer_norm_epsilon": 1e-05,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12,
            "n_positions": 1024,
            "num_labels": 1,
            "resid_pdrop": 0.1,
            "use_bfloat16": False,
            "vocab_size": self.tokenizer.vocab_size
        })
    else:
        print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
        exit()

    self.model = GPT2LMHeadModel(config)
    self.model.to(device)
    self.device = device
    if starter_model is not None:
        self.reload(starter_model)

    self.max_output_length = max_output_length
    self.max_input_length = max_input_length

    self.model.train()
    self.mode = "train"
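As a quick sketch of what the `bpecap` branch above builds: `GPT2Config.from_dict` turns a plain dict into a config whose `vocab_size` sizes the embedding table. The toy value below is a stand-in for `self.tokenizer.vocab_size`:

from transformers import GPT2Config, GPT2LMHeadModel

toy_vocab_size = 32000  # stand-in for self.tokenizer.vocab_size
config = GPT2Config.from_dict({"n_ctx": 1024, "n_embd": 768, "n_head": 12,
                               "n_layer": 12, "n_positions": 1024,
                               "vocab_size": toy_vocab_size})
model = GPT2LMHeadModel(config)
assert model.transformer.wte.num_embeddings == toy_vocab_size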
def korean_gpt_long_setence_life_test():
    config = get_config()
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    sent = '나는 밥을 먹었'  # "I ate a meal" (deliberately truncated prompt)
    toked = tok(sent)
    print(toked)
    input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
    input_ids = input_ids.to(device)
    outputs = kogpt2model.generate(input_ids=input_ids,
                                   max_length=100,
                                   min_length=50,
                                   repetition_penalty=1.2,
                                   do_sample=True,
                                   num_beams=3,
                                   bos_token_id=0,
                                   pad_token_id=3,
                                   eos_token_id=1,
                                   num_return_sequences=3)
    print("======== essay ===========")
    for i in range(3):  # 3 output sequences were generated
        toked = vocab.to_tokens(outputs[i].squeeze().tolist())
        ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                     ''.join(toked).replace('▁', ' ').strip())
        print('Generated {}: {}'.format(i, ret))
def get_kogpt2_model(model_file, vocab_file, ctx="cpu"):
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(model_file))
    device = torch.device(ctx)
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab_b_obj = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                         mask_token=None,
                                                         sep_token=None,
                                                         cls_token=None,
                                                         unknown_token='<unk>',
                                                         padding_token='<pad>',
                                                         bos_token='<s>',
                                                         eos_token='</s>')
    return kogpt2model, vocab_b_obj
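A hedged usage sketch for `get_kogpt2_model`; the file paths are placeholders, and the tokenization/decoding mirrors the test functions elsewhere in this section:

model, vocab = get_kogpt2_model('pytorch_kogpt2.params', 'kogpt2_vocab.spiece')  # placeholder paths
tok = SentencepieceTokenizer('kogpt2_vocab.spiece')  # placeholder path
toked = tok('나는 밥을 먹었')  # "I ate a meal" (deliberately truncated prompt)
input_ids = torch.tensor([[vocab[vocab.bos_token]] + vocab[toked]])
outputs = model.generate(input_ids=input_ids, max_length=50, do_sample=True,
                         bos_token_id=0, pad_token_id=3, eos_token_id=1)
print(''.join(vocab.to_tokens(outputs[0].tolist())).replace('▁', ' ').strip())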
def load_kogpt2_model_from_checkpoint(kogpt2, load_path, device, ctx='cpu'):
    try:
        checkpoint = torch.load(load_path, map_location=device)
        kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
        kogpt2model.load_state_dict(checkpoint['model_state_dict'])
        kogpt2model.eval()
    except Exception:
        # fall back to the base model when the checkpoint is missing or malformed
        count = 0
        kogpt2model, _ = load_kogpt2_model()
    else:
        # recover the count from the second number embedded in the checkpoint path
        count = int(re.findall(r"\d+", load_path)[1])
    print(count)
    return kogpt2model, count
def fine_tuning(config, fine_tune_num, AI_DIRECTORY):
    """ Train the model """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = config['kogpt_batch_size']
    train_path = AI_DIRECTORY + config['kogpt_story_train_data_path']
    num_train_epochs = config['kogpt_epoch']
    kogpt2_config = get_kog_config()
    kogpt2_model_path = AI_DIRECTORY + config['kogpt_model_path']
    kogpt2_vocab_path = AI_DIRECTORY + config['kogpt_vocab_path']

    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    kogpt2model.to(device)
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    loader = make_kogpt2_loader(train_path, batch_size)

    num_training_steps = len(loader) * num_train_epochs
    learning_rate = 5e-6
    adam_epsilon = 1e-8
    warmup_steps = 0
    no_decay = ["bias", "LayerNorm.weight"]
    # freeze_model(fine_tune_num, kogpt2model)
    # Note: both groups currently use weight_decay 0.0; the no_decay split only
    # takes effect if a non-zero decay is set for the first group.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in kogpt2model.named_parameters()
                       if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=num_training_steps)

    global_step = 0
    epochs_trained = 0
    tr_loss = 0.0
    logging_loss = 0.0
    kogpt2model.zero_grad()
    train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch")
    logging_steps = 500
    loss_record = []
    for epoch in train_iterator:
        epoch_iterator = tqdm(loader, desc="Iteration")
        for step, inputs in enumerate(epoch_iterator):
            kogpt2model.train()
            input_ids = inputs.to(device)
            labels = inputs.to(device)
            outputs = kogpt2model(input_ids=input_ids, labels=labels)
            loss = outputs[0]  # model outputs are always tuples in transformers (see docs)
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            kogpt2model.zero_grad()
            global_step += 1
            if logging_steps > 0 and global_step % logging_steps == 0:
                logs = {}
                loss_scalar = (tr_loss - logging_loss) / logging_steps
                learning_rate_scalar = scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["loss"] = loss_scalar
                loss_record.append(loss_scalar)
                logging_loss = tr_loss
                epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
    return kogpt2model, loss_record
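A hedged sketch of driving `fine_tuning` and persisting the result. The directory, the checkpoint name, and the `fine_tune_num` value (unused while `freeze_model` stays commented out) are placeholders:

config = get_config()  # assumed to return the same config dict used above
model, loss_record = fine_tuning(config, fine_tune_num=0, AI_DIRECTORY='/path/to/AI')
torch.save(model.state_dict(), '/path/to/AI/checkpoints/kogpt_finetuned.pth')
print(loss_record)  # one averaged loss value per 500 global steps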
def get_kogpt2_config():
    return GPT2Config.from_dict(kogpt2_config)
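For reference, this helper is consumed the same way the inline `from_dict` calls are used elsewhere in this section:

model = GPT2LMHeadModel(config=get_kogpt2_config())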
def predict(images, root_path, AI_directory_path, model_type="life"):
    config = get_config()

    # 0. Extract captions from images
    vocab = load_voca(AI_directory_path + config['caption_vocab_path'])
    caption_embed_size = config['caption_embed_size']
    caption_hidden_layer = config['caption_hidden_layer']
    caption_hidden_size = config['caption_hidden_size']
    caption_encoder_path = AI_directory_path + config['caption_encoder_path']
    caption_decoder_path = AI_directory_path + config['caption_decoder_path']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    max_sequence_len = 30  # default value
    transform = torch_transform.Compose([
        torch_transform.ToTensor(),
        torch_transform.Normalize(mean=(0.4444, 0.4215, 0.3833),
                                  std=(0.2738, 0.2664, 0.2766))])
    encoder = EncoderCNN(caption_embed_size)
    decoder = DecoderRNN(caption_embed_size, len(vocab), caption_hidden_layer, caption_hidden_size)
    encoder.load_state_dict(torch.load(caption_encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(caption_decoder_path, map_location=device))

    images = load_image(images, root_path, transform)
    encoder.eval()
    decoder.eval()
    encoder.to(device)
    decoder.to(device)
    images = images.to(device)
    features = encoder(images)
    states = None
    predicted_index = []
    lstm_inputs = features.unsqueeze(1)
    for i in range(max_sequence_len):
        outputs, states = decoder.lstm(lstm_inputs, states)
        # squeeze outputs to 2-D so they can be fed to the linear score layer
        outputs = outputs.squeeze(1)
        scores_per_batch = decoder.score_layer(outputs)
        values, predicted = scores_per_batch.max(1)
        predicted_index.append(predicted)
        lstm_inputs = decoder.embed(predicted)
        lstm_inputs = lstm_inputs.unsqueeze(1)
    predicted_index = torch.stack(predicted_index, dim=1)
    predicted_index = predicted_index.cpu().numpy()
    result_captions = []
    for wordindices in predicted_index:
        text = ""
        for index in wordindices:
            word = vocab.idx2word[index]
            if word == '<end>':
                break
            if word == '<unk>' or word == '<start>':
                continue
            text += word + " "
        result_captions.append(text)
    print("result_caption : ", result_captions)

    # 1. Translate captions to Korean
    korean_sentences = []
    for sent in result_captions:
        translate_result = get_translate(sent)
        if translate_result != -1:
            translate_result = re.sub(r'\.', '', translate_result)
            korean_sentences.append(translate_result)
    print("result_korean : ", korean_sentences)

    # 2. Generate text from the translated captions with KoGPT2
    kogpt2_config = get_kog_config()
    if model_type == "life":
        kogpt2_model_path = AI_directory_path + config['kogpt_life_model_path']
    elif model_type == "story":
        kogpt2_model_path = AI_directory_path + config['kogpt_story_model_path']
    else:
        kogpt2_model_path = AI_directory_path + config['kogpt_model_path']
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path, map_location=device))
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)
    korean_preprocess(korean_sentences)
    gpt_result = naive_prediction(korean_sentences, tok, vocab, device, kogpt2model, model_type)
    korean_postprocess(gpt_result)
    result = []
    make_sentence(gpt_result, "", result, 0)
    result.sort(key=lambda item: (-len(item), item))
    result_len = len(result)
    if result_len > 11:
        result_len = 11
    result = result[1:result_len]
    return result
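A hedged invocation sketch for `predict`; the image names and directories are placeholders:

sentences = predict(['wedding.jpg', 'ladder.jpg'],    # placeholder image names
                    root_path='/path/to/images',      # placeholder
                    AI_directory_path='/path/to/AI',  # placeholder
                    model_type='story')
for sentence in sentences:
    print(sentence)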
save_path = '/kogpt2_article/KoGPT2_checkpoint/'

kogpt2_config = {
    "initializer_range": 0.02,
    "layer_norm_epsilon": 0.000025,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50000
}

checkpoint = torch.load(save_path + 'KoGPT2_checkpoint.tar', map_location=PU)
kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
kogpt2model.load_state_dict(checkpoint['model_state_dict'])
kogpt2model.eval()
kogpt2model.to(torch.device(PU))
model = kogpt2model
Tokenizer = SentencepieceTokenizer(get_tokenizer(), num_best=0, alpha=0)

def make(start_msg):
    global Tokenizer
    sentence = start_msg
def kogpt_life_recursive_test():
    config = get_config()
    AI_directory_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI"
    kogpt2_config = get_kog_config()
    kogpt2_model_path = "C:\\Users\\multicampus\\s02p23c104\\Back\\AI\\checkpoints\\kogpt_life_model_20_2020-04-26-23-56-31.pth"
    kogpt2_vocab_path = AI_directory_path + config['kogpt_vocab_path']
    kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
    kogpt2model.load_state_dict(torch.load(kogpt2_model_path))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    kogpt2model.to(device)
    kogpt2model.eval()
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(kogpt2_vocab_path,
                                                   mask_token=None,
                                                   sep_token=None,
                                                   cls_token=None,
                                                   unknown_token='<unk>',
                                                   padding_token='<pad>',
                                                   bos_token='<s>',
                                                   eos_token='</s>')
    tok = SentencepieceTokenizer(kogpt2_vocab_path)

    # Caption sentences used to seed the recursive generation; in English:
    # "The bride and groom are posing for a photo in front of the wedding party.",
    # "A man in a blue shirt is standing on a ladder.", "Two men are standing".
    korean_sentences = [
        '신랑 신부가 결혼식 파티 앞에서 사진을 찍기 위해 포즈를 취하고 있다.',
        '파란 셔츠를 입은 남자가 사다리에 서 있다.',
        '두 남자가 서 있다'
    ]
    kogpt_input_sentences = []
    for korean in korean_sentences:
        korean_size = len(korean)
        if not kogpt_input_sentences:
            if korean_size > 3:
                kogpt_input_sentences.append(korean[:-2])
            elif korean_size > 1:
                kogpt_input_sentences.append(korean[:-1])
            else:
                kogpt_input_sentences.append(korean)
        else:
            for i in range(len(kogpt_input_sentences)):
                if korean_size > 3:
                    kogpt_input_sentences[i] += korean[:-2]
                elif korean_size > 1:
                    kogpt_input_sentences[i] += korean[:-1]
                else:
                    kogpt_input_sentences[i] += korean[:]
    kogpt_output_sentences = []
    print(kogpt_input_sentences)
    expected_length = 50
    for kogpt_input_sentence in kogpt_input_sentences:
        print(kogpt_input_sentence)
        toked = tok(kogpt_input_sentence)
        input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
        print(input_ids)
        input_ids = input_ids.to(device)
        input_length = input_ids.shape[1]
        outputs = kogpt2model.generate(input_ids=input_ids,
                                       max_length=input_length + expected_length,
                                       repetition_penalty=1.2,
                                       do_sample=True,
                                       num_beams=3,
                                       bos_token_id=0,
                                       pad_token_id=3,
                                       eos_token_id=1,
                                       num_return_sequences=3)
        for i in range(3):  # 3 output sequences were generated
            toked = vocab.to_tokens(outputs[i].squeeze().tolist())
            ret = re.sub(r'(<s>|</s>|<pad>|<unk>)', '',
                         ''.join(toked).replace('▁', ' ').strip())
            kogpt_output_sentences.append(ret)
    kogpt_input_sentences = copy.deepcopy(kogpt_output_sentences)
    print(kogpt_input_sentences)
"모두의 연애": "<unused3>", "숭실대 에타": "<unused5>", "대학생 잡담방": "<unused4>" } os.system('ls') app = Flask(__name__) can_gpu = torch.cuda.is_available() # Model & Tokenizer loading tokenizer = sentencepiece.SentencePieceProcessor() tokenizer.load(tok_path) if can_gpu: device = torch.device('cuda') model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=None, config=GPT2Config.from_dict(kogpt2_config), state_dict=torch.load(model_file)) else: device = torch.device('cpu') model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=None, config=GPT2Config.from_dict(kogpt2_config), state_dict=torch.load(model_file, map_location=device)) model.to(device) requests_queue = Queue() # request queue. BATCH_SIZE = 1 # max request size. CHECK_INTERVAL = 0.1 ##
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("training_args: ", training_args)

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        if model_args.config_name == 'kogpt2':
            config = GPT2Config.from_dict(kogpt2_config)
        else:
            config = AutoConfig.from_pretrained(model_args.config_name,
                                                cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        if model_args.model_name_or_path == 'kogpt2':
            config = GPT2Config.from_dict(kogpt2_config)
        else:
            config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                                cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name and not model_args.use_gluonnlp_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path and not model_args.use_gluonnlp_tokenizer:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path,
                                                  cache_dir=model_args.cache_dir)
    elif model_args.use_gluonnlp_tokenizer:
        vocab_file = model_args.vocab_path
        vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file,
                                                       mask_token=None,
                                                       sep_token=None,
                                                       cls_token=None,
                                                       unknown_token='<unk>',
                                                       padding_token='<pad>',
                                                       bos_token='<s>',
                                                       eos_token='</s>')
        tokenizer = nlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'kogpt2':
            model = GPT2LMHeadModel.from_pretrained(
                pretrained_model_name_or_path=None,
                config=config,
                state_dict=torch.load(kogpt2_model_path))
        else:
            model = AutoModelWithLMHead.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    if not model_args.use_gluonnlp_tokenizer:
        model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if model_args.use_gluonnlp_tokenizer:
        max_len = model_args.max_len
    else:
        max_len = tokenizer.max_len

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = max_len
    else:
        data_args.block_size = min(data_args.block_size, max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer,
                                max_len=max_len) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, max_len=max_len,
                               evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        print("model_path: ", model_path)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master() and not model_args.use_gluonnlp_tokenizer:
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results