def __init__(self, args, device='cpu'):
    print(args.bert_model)
    self.tokenizer = ElectraTokenizer.from_pretrained(args.bert_model)
    self.data_dir = args.data_dir

    # Load every JSON sample and mark whether it comes from the
    # high-school ("high") or middle-school subset.
    file_list = get_json_file_list(args.data_dir)
    self.data = []
    for file_name in file_list:
        with open(file_name, 'r') as f:
            data = json.load(f)
        data['high'] = 1 if 'high' in file_name else 0
        self.data.append(data)

    # Convert raw samples into data objects and count each subset.
    self.data_objs = []
    high_cnt = 0
    middle_cnt = 0
    for sample in self.data:
        high_cnt += sample['high']
        middle_cnt += (1 - sample['high'])
        self.data_objs += self._create_sample(sample)
    print('high school sample:', high_cnt)
    print('middle school sample:', middle_cnt)

    # Tokenize once up front, then cache the result to disk.
    for i in range(len(self.data_objs)):
        self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
    torch.save(self.data_objs, args.save_name)
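# A minimal sketch of how this preprocessor might be driven from the command
# line. Only the argument names (`bert_model`, `data_dir`, `save_name`) come
# from the constructor above; the class name `RaceDataset` and the default
# checkpoint are hypothetical stand-ins, since the snippet does not show the
# class declaration.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--bert_model', default='google/electra-base-discriminator',
                        help='checkpoint passed to ElectraTokenizer.from_pretrained')
    parser.add_argument('--data_dir', required=True,
                        help='directory scanned by get_json_file_list')
    parser.add_argument('--save_name', required=True,
                        help='output path for the torch.save()d sample list')
    args = parser.parse_args()
    dataset = RaceDataset(args)  # hypothetical class; preprocessing runs in __init__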
@classmethod
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version
                     of the tokenizer (True) or use the Python one (False).
                     Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # Guess the tokenizer type from the model name.
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "codebert" in pretrained_model_name_or_path.lower():
            if "mlm" in pretrained_model_name_or_path.lower():
                raise NotImplementedError("MLM part of codebert is currently not supported in FARM")
            else:
                tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "dpr-question_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRQuestionEncoderTokenizer"
        elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DPRContextEncoderTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # Return the appropriate tokenizer object. For classes that also exist in a
    # fast variant, match with "in" so both names hit the same branch.
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRContextEncoderTokenizer", "DPRContextEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
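# Usage sketch for the loader above. Assumption: it is the `load` classmethod
# on FARM's `Tokenizer` facade (import path as in FARM); the model names below
# are illustrative, not from the original source.
from farm.modeling.tokenization import Tokenizer

# Class inferred from the name: "electra" -> ElectraTokenizer; use_fast=True
# selects ElectraTokenizerFast, one of the supported fast variants.
electra_tok = Tokenizer.load("google/electra-base-discriminator", use_fast=True)

# Or pin the class explicitly when the checkpoint name is ambiguous:
local_tok = Tokenizer.load("./my-local-checkpoint", tokenizer_class="BertTokenizer")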
@classmethod
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from `pretrained_model_name_or_path` or define it
    manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    # Guess the tokenizer type from the model name.
    if tokenizer_class is None:
        if "albert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "AlbertTokenizer"
        elif "xlm-roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLMRobertaTokenizer"
        elif "roberta" in pretrained_model_name_or_path.lower():
            tokenizer_class = "RobertaTokenizer"
        elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
            tokenizer_class = "CamembertTokenizer"
        elif "distilbert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "DistilBertTokenizer"
        elif "bert" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        elif "xlnet" in pretrained_model_name_or_path.lower():
            tokenizer_class = "XLNetTokenizer"
        elif "electra" in pretrained_model_name_or_path.lower():
            tokenizer_class = "ElectraTokenizer"
        elif "word2vec" in pretrained_model_name_or_path.lower() or \
                "glove" in pretrained_model_name_or_path.lower() or \
                "fasttext" in pretrained_model_name_or_path.lower():
            tokenizer_class = "EmbeddingTokenizer"
        elif "minilm" in pretrained_model_name_or_path.lower():
            tokenizer_class = "BertTokenizer"
        else:
            raise ValueError(
                f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set "
                f"arg `tokenizer_class` in Tokenizer.load() to one of: AlbertTokenizer, "
                f"XLMRobertaTokenizer, RobertaTokenizer, DistilBertTokenizer, BertTokenizer, or "
                f"XLNetTokenizer.")
    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # Return the appropriate tokenizer object. Initialize `ret` so a manually
    # supplied but unmatched tokenizer_class fails cleanly below instead of
    # raising NameError.
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "RobertaTokenizer":
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "DistilBertTokenizer":
        ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "BertTokenizer":
        ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "ElectraTokenizer":
        ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
@classmethod
def load(cls, pretrained_model_name_or_path, tokenizer_class=None, use_fast=False, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from the model config or define it manually via
    `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, False by default) Indicate if FARM should try to load the fast version
                     of the tokenizer (True) or use the Python one (False).
                     Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # Return the appropriate tokenizer object. For classes that also exist in a
    # fast variant, match with "in" so both names hit the same branch.
    ret = None
    if tokenizer_class == "AlbertTokenizer":
        if use_fast:
            logger.error('AlbertTokenizerFast is not supported! Using AlbertTokenizer instead.')
        ret = AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif tokenizer_class == "XLMRobertaTokenizer":
        if use_fast:
            logger.error('XLMRobertaTokenizerFast is not supported! Using XLMRobertaTokenizer instead.')
        ret = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:
        if use_fast:
            logger.error('RobertaTokenizerFast is not supported! Using RobertaTokenizer instead.')
        ret = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "XLNetTokenizer":
        if use_fast:
            logger.error('XLNetTokenizerFast is not supported! Using XLNetTokenizer instead.')
        ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error('EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.')
        ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "CamembertTokenizer":
        if use_fast:
            logger.error('CamembertTokenizerFast is not supported! Using CamembertTokenizer instead.')
        ret = CamembertTokenizer._from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRQuestionEncoderTokenizerFast":
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class in ("DPRContextEncoderTokenizer", "DPRContextEncoderTokenizerFast"):
        if use_fast or tokenizer_class == "DPRContextEncoderTokenizerFast":
            ret = DPRContextEncoderTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
def analyze(self):
    # Build the ELECTRA config object from the saved checkpoint.
    electra_config = ElectraConfig.from_pretrained(
        os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])),
        num_labels=self.config["senti_labels"],
        cache_dir=None)

    # Build the ELECTRA tokenizer object.
    electra_tokenizer = ElectraTokenizer.from_pretrained(
        os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])),
        do_lower_case=False,
        cache_dir=None)

    # Build the ELECTRA model object.
    electra_model = ElectraForSequenceClassification.from_pretrained(
        os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])),
        config=electra_config,
        lstm_hidden=self.config['lstm_hidden'],
        label_emb_size=self.config['lstm_hidden'] * 2,
        score_emb_size=self.config['lstm_hidden'] * 2,
        score_size=self.config['score_labels'],
        num_layer=self.config['lstm_num_layer'],
        bilstm_flag=self.config['bidirectional_flag'],
        cache_dir=self.config["cache_dir_path"])
    electra_model.cuda()

    # Read the evaluation data.
    test_datas = preprocessing.read_data(
        file_path=self.config["analyze_data_path"], mode=self.config["mode"])

    # Preprocess the evaluation data.
    test_dataset = preprocessing.convert_data2dataset(
        datas=test_datas,
        tokenizer=electra_tokenizer,
        max_length=self.config["max_length"],
        labels=self.config["senti_labels"],
        score_labels=self.config["score_labels"],
        mode=self.config["mode"])

    # DataLoader that yields the evaluation data in batches.
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    electra_model.eval()

    # Accuracy on the evaluation data, plus the model inputs, predictions, and gold labels.
    test_accuracy, total_input_ids, total_predicts, total_corrects = self.do_analyze(
        electra_model=electra_model, test_dataloader=test_dataloader, mode=self.config["mode"])
    print("test_accuracy : {}\n".format(round(test_accuracy, 4)))

    # Compare model predictions with gold labels for ten evaluation cases.
    print("Comparing model predictions with gold labels for 10 test samples")
    self.show_result(
        total_input_ids=total_input_ids[:10],
        total_predicts=total_predicts[:10],
        total_corrects=total_corrects[:10],
        tokenizer=electra_tokenizer)
def train(self):
    # Build the ELECTRA config object.
    electra_config = ElectraConfig.from_pretrained(
        "/home/mongjin/KuELECTRA_base",
        num_labels=self.config["senti_labels"],
        cache_dir=self.config["cache_dir_path"])

    # Build the ELECTRA tokenizer object.
    electra_tokenizer = ElectraTokenizer.from_pretrained(
        "/home/mongjin/KuELECTRA_base",
        do_lower_case=False,
        cache_dir=self.config["cache_dir_path"])

    # Build the ELECTRA model object (weights converted from a TensorFlow checkpoint).
    electra_model = ElectraForSequenceClassification.from_pretrained(
        "/home/mongjin/KuELECTRA_base",
        config=electra_config,
        lstm_hidden=self.config['lstm_hidden'],
        label_emb_size=self.config['lstm_hidden'] * 2,
        score_emb_size=self.config['lstm_hidden'] * 2,
        score_size=self.config['score_labels'],
        num_layer=self.config['lstm_num_layer'],
        bilstm_flag=self.config['bidirectional_flag'],
        cache_dir=self.config["cache_dir_path"],
        from_tf=True)
    electra_model.cuda()

    # Read the training data.
    train_datas = preprocessing.read_data(
        file_path=self.config["train_data_path"], mode=self.config["mode"])

    # Preprocess the training data.
    train_dataset = preprocessing.convert_data2dataset(
        datas=train_datas,
        tokenizer=electra_tokenizer,
        max_length=self.config["max_length"],
        labels=self.config["senti_labels"],
        score_labels=self.config["score_labels"],
        mode=self.config["mode"])

    # DataLoader that yields the training data in shuffled batches.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=self.config["batch_size"])

    # Read the evaluation data.
    test_datas = preprocessing.read_data(
        file_path=self.config["test_data_path"], mode=self.config["mode"])

    # Preprocess the evaluation data.
    test_dataset = preprocessing.convert_data2dataset(
        datas=test_datas,
        tokenizer=electra_tokenizer,
        max_length=self.config["max_length"],
        labels=self.config["senti_labels"],
        score_labels=self.config["score_labels"],
        mode=self.config["mode"])

    # DataLoader that yields the evaluation data in batches.
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    # Total number of training steps (in batches).
    t_total = len(train_dataloader) // self.config["gradient_accumulation_steps"] * self.config["epoch"]

    # Optimizer for model training; no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer = AdamW([
        {'params': [p for n, p in electra_model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'lr': 5e-5, 'weight_decay': self.config['weight_decay']},
        {'params': [p for n, p in electra_model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'lr': 5e-5, 'weight_decay': 0.0},
    ])
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.config["warmup_steps"],
        num_training_steps=t_total)

    # Restore optimizer and scheduler state from a previous run if present.
    if os.path.isfile(os.path.join(self.config["model_dir_path"], "optimizer.pt")) \
            and os.path.isfile(os.path.join(self.config["model_dir_path"], "scheduler.pt")):
        optimizer.load_state_dict(
            torch.load(os.path.join(self.config["model_dir_path"], "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(self.config["model_dir_path"], "scheduler.pt")))
        print("####################### Success Load Model ###########################")

    global_step = 0
    electra_model.zero_grad()
    max_test_accuracy = 0
    for epoch in range(self.config["epoch"]):
        electra_model.train()

        # Accuracy and average loss on the training data.
        train_accuracy, average_loss, global_step, score_acc = self.do_train(
            electra_model=electra_model, optimizer=optimizer, scheduler=scheduler,
            train_dataloader=train_dataloader, epoch=epoch + 1, global_step=global_step)
        print("train_accuracy : {}\taverage_loss : {}\n".format(
            round(train_accuracy, 4), round(average_loss, 4)))
        print("train_score_accuracy :", "{:.6f}".format(score_acc))

        electra_model.eval()

        # Accuracy on the evaluation data.
        test_accuracy, score_acc = self.do_evaluate(
            electra_model=electra_model, test_dataloader=test_dataloader,
            mode=self.config["mode"])
        print("test_accuracy : {}\n".format(round(test_accuracy, 4)))
        print("test_score_accuracy :", "{:.6f}".format(score_acc))

        # Save a checkpoint whenever the current accuracy beats the best so far.
        if max_test_accuracy < test_accuracy:
            max_test_accuracy = test_accuracy
            output_dir = os.path.join(self.config["model_dir_path"],
                                      "checkpoint-{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            electra_config.save_pretrained(output_dir)
            electra_tokenizer.save_pretrained(output_dir)
            electra_model.save_pretrained(output_dir)
            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        print("max_test_accuracy :", "{:.6f}".format(round(max_test_accuracy, 4)))
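# The trainer above pulls every hyperparameter from `self.config`. As a reading
# aid, here is a hypothetical dictionary covering the keys train() touches; all
# values are illustrative placeholders, not the authors' settings.
example_config = {
    "mode": "train",
    "train_data_path": "./data/train.txt",
    "test_data_path": "./data/test.txt",
    "model_dir_path": "./model",
    "cache_dir_path": "./cache",
    "max_length": 128,
    "batch_size": 32,
    "epoch": 5,
    "gradient_accumulation_steps": 1,
    "warmup_steps": 0,
    "weight_decay": 0.01,
    "senti_labels": 2,          # number of sentiment classes
    "score_labels": 5,          # number of score classes
    "lstm_hidden": 256,
    "lstm_num_layer": 1,
    "bidirectional_flag": True,
}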
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
import os
from bs4 import BeautifulSoup

'''Official evaluation script for KorQuAD 2.0.'''
'''Based on the SQuAD v1.1 evaluation script:
https://rajpurkar.github.io/SQuAD-explorer/'''

from transformers.tokenization_electra import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained(
    "../../baseline/checkpoint-24000",
    do_lower_case=False,
)


def normalize_answer(s):
    def tag_clean(t):
        # Strip HTML tags, keeping only the visible text.
        return BeautifulSoup(t).get_text()

    def remove_(text):
        '''Strip unnecessary symbols.'''
        text = re.sub("'", " ", text)
        text = re.sub('"', " ", text)
        text = re.sub('《', " ", text)
        text = re.sub('》', " ", text)
        text = re.sub('<', " ", text)
        text = re.sub('>', " ", text)
        text = re.sub('〈', " ", text)