from transformers import ElectraTokenizer, ElectraForQuestionAnswering


def download_model(outputdir_tokenizer: str, outputdir_pretrained: str):
    # ELECTRA uses the same WordPiece vocabulary as BERT, so the slow tokenizer
    # is loaded from the bert-base-uncased vocab and saved locally.
    slow_tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")
    print("Save tokenizer to ", outputdir_tokenizer)
    slow_tokenizer.save_pretrained(outputdir_tokenizer)

    model = ElectraForQuestionAnswering.from_pretrained(
        "google/electra-base-discriminator")
    model.save_pretrained(outputdir_pretrained)
    print("Save pretrained electra model to", outputdir_pretrained)
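# A minimal usage sketch for download_model; the two output directories below are
# illustrative placeholders, not paths taken from the original project.
if __name__ == "__main__":
    download_model(outputdir_tokenizer="./electra_base_uncased",
                   outputdir_pretrained="./electra_pretrained")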
def create_model(self, transformer="longformer"):
    if transformer == "distilbert":
        from transformers import DistilBertForQuestionAnswering
        self.model = DistilBertForQuestionAnswering.from_pretrained(
            "distilbert-base-uncased")
    elif transformer == "bert":
        from transformers import BertForQuestionAnswering
        self.model = BertForQuestionAnswering.from_pretrained(
            "bert-base-uncased")
    elif transformer == "roberta":
        from transformers import RobertaForQuestionAnswering
        self.model = RobertaForQuestionAnswering.from_pretrained(
            "roberta-base")
    elif transformer == "roberta_squad":
        from transformers import RobertaForQuestionAnswering
        self.model = RobertaForQuestionAnswering.from_pretrained(
            "deepset/roberta-base-squad2")
    elif transformer == "longformer":
        from transformers import LongformerForQuestionAnswering
        self.model = LongformerForQuestionAnswering.from_pretrained(
            "allenai/longformer-base-4096")
    elif transformer == "bart":
        from transformers import BartForQuestionAnswering
        self.model = BartForQuestionAnswering.from_pretrained(
            "facebook/bart-base")
    elif transformer == "electra":
        from transformers import ElectraForQuestionAnswering
        self.model = ElectraForQuestionAnswering.from_pretrained(
            "google/electra-small-discriminator")
    else:
        print("The model you chose is not available in this version. "
              "You can try to manually change the code or manually overwrite the variable self.model")
        print("The available choices are 'distilbert', 'bert', 'roberta', "
              "'roberta_squad', 'longformer', 'bart', 'electra'")
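# Sketch of how create_model could be invoked; `Reader` is a hypothetical wrapper
# class assumed here only to illustrate the call, it is not defined in the snippet above.
# reader = Reader()
# reader.create_model(transformer="electra")  # sets reader.model to google/electra-small-discriminator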
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected")
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    # Other parameters
    parser.add_argument("--data_dir", default=None, type=str,
                        help="The input data dir. Should contain the .json files for the task. "
                        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--train_file", default=None, type=str,
                        help="The input training file. If a data dir is specified, will look for the file there. "
                        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="The input evaluation file. If a data dir is specified, will look for the file there. "
                        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--version_2_with_negative", action="store_true",
                        help="If true, the SQuAD examples contain some that do not have an answer.")
    parser.add_argument("--null_score_diff_threshold", type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                        "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                        "be truncated to this length.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", default=True, action="store_true",
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                        "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action="store_true",
                        help="If true, all of the warnings related to data processing will be printed. "
                        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--logging_steps", type=int, default=100,
                        help="Log every X update steps.")
    parser.add_argument("--save_steps", type=int, default=10000,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--server_ip", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--threads", type=int, default=1,
                        help="multiple threads for converting examples to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")
    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(
        args.model_name_or_path, config=config)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path,
                                                 do_lower_case=False)
    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
from flask import Flask, request, render_template
import torch
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from tokenizers import BertWordPieceTokenizer
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from reader import get_answer

# Reader: ELECTRA QA model with fine-tuned weights, run on CPU
model = ElectraForQuestionAnswering.from_pretrained("Reader/electra_QA").to(
    device=torch.device('cpu'))
model.load_state_dict(
    torch.load('Reader/weight_electra/weights_3.pth',
               map_location=torch.device('cpu')))
model.eval()
tokenizer = BertWordPieceTokenizer("Reader/electra_base_uncased/vocab.txt",
                                   lowercase=True)
torch.set_grad_enabled(False)

# Retriever: DPR question and context encoders
q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
    "facebook/dpr-question_encoder-single-nq-base")
q_encoder = DPRQuestionEncoder.from_pretrained(
    "Retrieval/question_encoder").to(device=torch.device('cpu'))
q_encoder.eval()

# ctx_tokenizer = BertWordPieceTokenizer("ctx_tokenizer/vocab.txt", lowercase=True)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base")
ctx_encoder = DPRContextEncoder.from_pretrained("Retrieval/ctx_encoder").to(
    device=torch.device('cpu'))
def __init__(self) -> None:
    self.lists = {}

    # M-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    self.lists["M-BERT"] = {
        "Tokenizer": self.bert_multilingual_tokenizer,
        "Model": self.bert_multilingual_model
    }
    print("====================================")
    print("[BERT] Google Multilingual BERT loaded")
    print("====================================")

    # KR-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained(
        'snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained(
        'snunlp/KR-Medium').eval()
    self.lists["KR-Medium"] = {
        "Tokenizer": self.krbert_tokenizer,
        "Model": self.krbert_model
    }
    print("====================================")
    print("[BERT] KR-BERT loaded")
    print("====================================")

    # BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained(
        'kykim/bert-kor-base').eval()
    self.lists["bert-kor-base"] = {
        "Tokenizer": self.bert_kor_tokenizer,
        "Model": self.bert_kor_model
    }
    print("====================================")
    print("[BERT] BERT-kor-base loaded")
    print("====================================")

    # ALBERT
    from transformers import AlbertForMaskedLM
    self.albert_tokenizer = BertTokenizerFast.from_pretrained(
        'kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained(
        'kykim/albert-kor-base').eval()
    self.lists["albert-kor-base"] = {
        "Tokenizer": self.albert_tokenizer,
        "Model": self.albert_model
    }
    print("====================================")
    print("[BERT] ALBERT-kor-base loaded")
    print("====================================")

    # XLM-Roberta
    from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained(
        'xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
        'xlm-roberta-base').eval()
    self.lists["xlm-roberta-base"] = {
        "Tokenizer": self.xlmroberta_tokenizer,
        "Model": self.xlmroberta_model
    }
    print("====================================")
    print("[BERT] XLM-Roberta-kor loaded")
    print("====================================")

    from transformers import BertTokenizerFast, EncoderDecoderModel
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained(
        "kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained(
        "kykim/bertshared-kor-base")
    self.lists["bertshared-kor-base"] = {
        "Tokenizer": self.tokenizer_bertshared,
        "Model": self.bertshared_model
    }
    print("====================================")
    print("[Seq2seq + BERT] bertshared-kor-base loaded")
    print("====================================")

    # gpt3-kor-small_based_on_gpt2
    from transformers import BertTokenizerFast, GPT2LMHeadModel
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained(
        "kykim/gpt3-kor-small_based_on_gpt2")
    self.lists["gpt3-kor-small_based_on_gpt2"] = {
        "Tokenizer": self.tokenizer_gpt3,
        "Model": self.model_gpt3
    }
    print("====================================")
    print("[GPT3] gpt3-small-based-on-gpt2 loaded")
    print("====================================")

    # electra-base-kor
    from transformers import ElectraTokenizerFast, ElectraModel
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained(
        "kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained(
        "kykim/electra-kor-base")
    self.lists["electra-kor-base"] = {
        "Tokenizer": self.tokenizer_electra,
        "Model": self.electra_model
    }
    print("====================================")
    print("[ELECTRA] electra-kor-base loaded")
    print("====================================")

    from transformers import ElectraTokenizerFast, ElectraForQuestionAnswering
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.lists["electra-kor-QA"] = {
        "Tokenizer": self.electra_tokenizer_QA,
        "Model": self.electra_model_QA
    }
    print("====================================")
    print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
    print("====================================")
def train_model(dir_tokenizer: str = None,
                dir_model: str = None,
                dir_data: str = None):
    batch_size = 16
    epochs = 10

    raw_train_data, raw_eval_data = load_data(dir_data)
    train_squad_examples = create_squad_examples(raw_train_data,
                                                 "Creating training points",
                                                 dir_tokenizer)
    x_train, y_train = create_inputs_targets(train_squad_examples)
    eval_squad_examples = create_squad_examples(raw_eval_data,
                                                "Creating evaluation points",
                                                dir_tokenizer)
    x_eval, y_eval = create_inputs_targets(eval_squad_examples)

    train_data = TensorDataset(torch.tensor(x_train[0], dtype=torch.int64),
                               torch.tensor(x_train[1], dtype=torch.float),
                               torch.tensor(x_train[2], dtype=torch.int64),
                               torch.tensor(y_train[0], dtype=torch.int64),
                               torch.tensor(y_train[1], dtype=torch.int64))
    print(f"{len(train_data)} training points created.")
    train_sampler = RandomSampler(train_data)
    train_data_loader = DataLoader(train_data,
                                   sampler=train_sampler,
                                   batch_size=batch_size)

    eval_data = TensorDataset(torch.tensor(x_eval[0], dtype=torch.int64),
                              torch.tensor(x_eval[1], dtype=torch.float),
                              torch.tensor(x_eval[2], dtype=torch.int64),
                              torch.tensor(y_eval[0], dtype=torch.int64),
                              torch.tensor(y_eval[1], dtype=torch.int64))
    print(f"{len(eval_data)} evaluation points created.")
    eval_sampler = SequentialSampler(eval_data)
    validation_data_loader = DataLoader(eval_data,
                                        sampler=eval_sampler,
                                        batch_size=batch_size)

    model = ElectraForQuestionAnswering.from_pretrained(dir_model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = torch.optim.Adam(lr=1e-5,
                                 betas=(0.9, 0.98),
                                 eps=1e-9,
                                 params=optimizer_grouped_parameters)

    for epoch in range(1, epochs + 1):
        # ============================================ TRAINING ============================================
        print("Training epoch ", str(epoch))
        training_pbar = tqdm(total=len(train_data),
                             position=0,
                             leave=True,
                             file=sys.stdout,
                             bar_format="{l_bar}%s{bar}%s{r_bar}" %
                             (Fore.GREEN, Fore.RESET))
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        for step, batch in enumerate(train_data_loader):
            batch = tuple(t for t in batch)
            input_word_ids, input_mask, input_type_ids, start_token_idx, end_token_idx = batch
            optimizer.zero_grad()
            output = model(input_ids=input_word_ids,
                           attention_mask=input_mask,
                           token_type_ids=input_type_ids,
                           start_positions=start_token_idx,
                           end_positions=end_token_idx)
            # print(loss)
            loss = output[0]
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
            nb_tr_steps += 1
            training_pbar.update(input_word_ids.size(0))
        training_pbar.close()
        print(f"\nTraining loss={tr_loss / nb_tr_steps:.4f}")
        torch.save(model.state_dict(), "./weights_" + str(epoch) + ".pth")

        # ============================================ VALIDATION ==========================================
        validation_pbar = tqdm(total=len(eval_data),
                               position=0,
                               leave=True,
                               file=sys.stdout,
                               bar_format="{l_bar}%s{bar}%s{r_bar}" %
                               (Fore.BLUE, Fore.RESET))
        model.eval()
        eval_examples_no_skip = [
            _ for _ in eval_squad_examples if _.skip is False
        ]
        currentIdx = 0
        count = 0
        for batch in validation_data_loader:
            batch = tuple(t for t in batch)
            input_word_ids, input_mask, input_type_ids, start_token_idx, end_token_idx = batch
            with torch.no_grad():
                output_ = model(input_ids=input_word_ids,
                                attention_mask=input_mask,
                                token_type_ids=input_type_ids)
            # print(output_.start_logits)
            start_logits, end_logits = output_.start_logits, output_.end_logits
            pred_start = start_logits.detach().cpu().numpy()
            pred_end = end_logits.detach().cpu().numpy()
            for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
                squad_eg = eval_examples_no_skip[currentIdx]
                currentIdx += 1
                offsets = squad_eg.context_token_to_char
                start = np.argmax(start)
                end = np.argmax(end)
                if start >= len(offsets):
                    continue
                pred_char_start = offsets[start][0]
                if end < len(offsets):
                    pred_char_end = offsets[end][1]
                    pred_ans = squad_eg.context[pred_char_start:pred_char_end]
                else:
                    pred_ans = squad_eg.context[pred_char_start:]
                normalized_pred_ans = normalize_text(pred_ans)
                normalized_true_ans = [
                    normalize_text(_) for _ in squad_eg.all_answers
                ]
                if normalized_pred_ans in normalized_true_ans:
                    count += 1
            validation_pbar.update(input_word_ids.size(0))
        acc = count / len(y_eval[0])
        validation_pbar.close()
        print(f"\nEpoch={epoch}, exact match score={acc:.2f}")
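# A hedged example of calling train_model; the three arguments below are
# placeholders for the saved tokenizer directory, the pretrained ELECTRA
# checkpoint, and the SQuAD-style data location, not paths defined in this file.
if __name__ == "__main__":
    train_model(dir_tokenizer="./electra_base_uncased",
                dir_model="./electra_pretrained",
                dir_data="./data")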
from transformers import ElectraTokenizer, ElectraForQuestionAnswering, pipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v2-distilled-korquad-384")
model = ElectraForQuestionAnswering.from_pretrained(
    "monologg/koelectra-small-v2-distilled-korquad-384")

qa = pipeline("question-answering", tokenizer=tokenizer, model=model)

pprint(
    qa({
        # Question: "Who is the president of Korea?"
        "question": "한국의 대통령은 누구인가?",
        # Context: "President Moon Jae-in attended the 'Deview 2019' event held at
        # COEX in Seoul on the 28th, encouraged young developers, and presented the
        # government's basic plan for artificial intelligence."
        "context":
        "문재인 대통령은 28일 서울 코엑스에서 열린 ‘데뷰 (Deview) 2019’ 행사에 참석해 젊은 개발자들을 격려하면서 우리 정부의 인공지능 기본구상을 내놓았다.",
    }))
def load_model_tokenizer(path):
    return ElectraForQuestionAnswering.from_pretrained(path), \
           ElectraTokenizerFast.from_pretrained(path)
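# Usage sketch pairing load_model_tokenizer with a transformers question-answering
# pipeline, mirroring the koelectra example above; the checkpoint path and the
# question/context strings are assumed here for illustration only.
from transformers import pipeline

model, tokenizer = load_model_tokenizer("Reader/electra_QA")
qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
print(qa(question="Who attended the Deview 2019 event?",
         context="President Moon Jae-in attended the Deview 2019 event held at COEX in Seoul."))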