def load_bert(model_path="bert/model/pytorch_model.bin", config_file="bert/config_parameters/config.json"): print("Loading BERT-model...") config = BertConfig(config_file) model = BertForQuestionAnswering(config) model.load_state_dict( torch.load(model_path, map_location=torch.device("cpu"))) print("Model loaded.\n\n") return model
def bertForQuestionAnswering(*args, **kwargs): """ BertForQuestionAnswering is a fine-tuning model that includes BertModel with a token-level classifiers on top of the full sequence of last hidden states. Note that the classification head is only initialized and has to be trained. Example: # Load the tokenizer >>> import torch >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False) # Prepare tokenized input >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]" >>> tokenized_text = tokenizer.tokenize(text) >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] >>> tokens_tensor = torch.tensor([indexed_tokens]) >>> segments_tensors = torch.tensor([segments_ids]) # Load bertForQuestionAnswering >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased') >>> model.eval() # Predict the start and end positions logits >>> with torch.no_grad(): start_logits, end_logits = model(tokens_tensor, segments_tensors) # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14]) # set model.train() before if training this loss >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions) """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) return model
def bertForQuestionAnswering(*args, **kwargs): """ BertForQuestionAnswering is a fine-tuning model that includes BertModel with a token-level classifiers on top of the full sequence of last hidden states. """ model = BertForQuestionAnswering.from_pretrained(*args, **kwargs) return model
def __init__(self): self.log = {} self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) self.model = BertForQuestionAnswering.from_pretrained(args.bert_model) self.model.to(self.device) self.model.eval() self.args = args
def test_BertForQuestionAnswering(): input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForQuestionAnswering(config) print(model(input_ids, token_type_ids, input_mask))
def __init__(self, token_makers, pretrained_model_name=None, answer_maxlen=30): super(BertForQA, self).__init__(token_makers) self.bert = True # for optimizer's model parameters self.answer_maxlen = answer_maxlen self.model = BertForQuestionAnswering.from_pretrained( pretrained_model_name, cache_dir=str(CachePath.ROOT)) self.criterion = nn.CrossEntropyLoss()
def __init__(self, word_vectors, char_vectors, hidden_size, drop_prob): super(Embedding, self).__init__() self.drop_prob = drop_prob # self.embed = nn.Embedding.from_pretrained(word_vectors) # This is actually BERT self.embed = BertForQuestionAnswering.from_pretrained( 'bert-large-uncased') for name, param in self.embed.named_parameters(): param.requires_grad = False self.embed_char = CNNEmbeddings(char_vectors=char_vectors, embed_size=64) # added_flag # self.proj = nn.Linear(word_vectors.size(1), hidden_size, bias=False) self.proj = nn.Linear(64 + 1024, hidden_size, bias=False) # added_flag hardcoded self.hwy = HighwayEncoder(2, hidden_size)
def start(): app = Flask(__name__) host = "0.0.0.0" port = 8000 debug = True parser = argparse.ArgumentParser() parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) config = BertConfig("./output/config.json") model = BertForQuestionAnswering(config) model.load_state_dict( torch.load("./output/pytorch_model.bin", map_location='cpu')) model.to(device) @app.route('/', methods=['POST']) def filter(): dat_in = { "index": 2, "original_sentence": "existing image captioning models do not generalize well to out-of-domain images containing novel scenes or objects . this limitation severely hinders the use of these models in real world applications dealing with images in the wild . we address this problem using a flexible approach that enables existing deep captioning architectures to take advantage of image taggers at test time , without re-training . our method uses constrained beam search to force the inclusion of selected tag words in the output , and fixed , pretrained word embeddings to facilitate vocabulary expansion to previously unseen tag words . using this approach we achieve state of the art results for out-of-domain captioning on mscoco -LRB- and improved results for in-domain captioning -RRB- . perhaps surprisingly , our results significantly outperform approaches that incorporate the same tag predictions into the learning algorithm . we also show that we can significantly improve the quality of generated imagenet captions by leveraging ground-truth labels . ", "tagged_sentence": "existing│O_ANS image│O_ANS captioning│O_ANS models│O_ANS do│O_ANS not│O_ANS generalize│O_ANS well│O_ANS to│O_ANS out-of-domain│O_ANS images│O_ANS containing│O_ANS novel│O_ANS scenes│O_ANS or│O_ANS objects│O_ANS .│O_ANS this│O_ANS limitation│O_ANS severely│O_ANS hinders│O_ANS the│O_ANS use│O_ANS of│O_ANS these│O_ANS models│O_ANS in│O_ANS real│O_ANS world│O_ANS applications│O_ANS dealing│O_ANS with│O_ANS images│O_ANS in│O_ANS the│O_ANS wild│O_ANS .│O_ANS we│O_ANS address│O_ANS this│O_ANS problem│O_ANS using│O_ANS a│O_ANS flexible│O_ANS approach│O_ANS that│O_ANS enables│O_ANS existing│O_ANS deep│O_ANS captioning│O_ANS architectures│O_ANS to│O_ANS take│O_ANS advantage│O_ANS of│O_ANS image│O_ANS taggers│O_ANS at│O_ANS test│O_ANS time│O_ANS ,│O_ANS without│O_ANS re-training│O_ANS .│O_ANS our│O_ANS method│O_ANS uses│O_ANS constrained│O_ANS beam│O_ANS search│O_ANS to│O_ANS force│O_ANS the│O_ANS inclusion│O_ANS of│O_ANS selected│O_ANS tag│O_ANS words│O_ANS in│O_ANS the│O_ANS output│O_ANS ,│O_ANS and│O_ANS fixed│O_ANS ,│O_ANS pretrained│O_ANS word│B_ANS embeddings│I_ANS to│O_ANS facilitate│O_ANS vocabulary│O_ANS expansion│O_ANS to│O_ANS previously│O_ANS unseen│O_ANS tag│O_ANS words│O_ANS .│O_ANS using│O_ANS this│O_ANS approach│O_ANS we│O_ANS achieve│O_ANS state│O_ANS of│O_ANS the│O_ANS art│O_ANS results│O_ANS for│O_ANS out-of-domain│O_ANS captioning│O_ANS on│O_ANS mscoco│O_ANS -LRB-│O_ANS and│O_ANS improved│O_ANS results│O_ANS for│O_ANS in-domain│O_ANS captioning│O_ANS -RRB-│O_ANS .│O_ANS perhaps│O_ANS surprisingly│O_ANS ,│O_ANS our│O_ANS results│O_ANS significantly│O_ANS outperform│O_ANS approaches│O_ANS that│O_ANS incorporate│O_ANS the│O_ANS same│O_ANS tag│O_ANS predictions│O_ANS into│O_ANS the│O_ANS learning│O_ANS algorithm│O_ANS .│O_ANS we│O_ANS also│O_ANS show│O_ANS that│O_ANS we│O_ANS can│O_ANS significantly│O_ANS improve│O_ANS the│O_ANS quality│O_ANS of│O_ANS generated│O_ANS imagenet│O_ANS captions│O_ANS by│O_ANS leveraging│O_ANS ground-truth│O_ANS labels│O_ANS .│O_ANS ", "answer": "word embeddings", "question": [ "What does pretrained stand for ?", "What is pretrained ?", "What does re-training stand for ?", "What is the pretrained ?", "What is the term for pretrained ?" ], "score": [ -2.3564553260803223, -3.8269970417022705, -4.229936122894287, -5.298074722290039, -5.689377307891846 ] } eval_examples = read_squad_examples(input_data=dat_in, is_training=False, version_2_with_negative=True) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=384, doc_stride=128, max_query_length=args.max_query_length, is_training=False) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) result = write_predictions(eval_examples, eval_features, all_results, 20, 30, True, args.verbose_logging, True, args.null_score_diff_threshold) # inputs = request.get_json(force=True) return result app.run(debug=debug, host=host, port=port, use_reloader=False, threaded=True)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForQuestionAnswering.from_pretrained(args.bert_model) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # if args.local_rank != -1: # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train: assert (args.train_file is not None) and (args.dev_file is not None) if args.eval_test: assert args.test_file is not None else: assert args.dev_file is not None if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) # read query templates query_templates = read_query_templates(normal_file=args.normal_file, des_file=args.des_file) if args.do_train or (not args.eval_test): eval_examples = read_ace_examples(input_file=args.dev_file, is_training=False) gold_examples = read_ace_examples(input_file=args.gold_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=False) logger.info("***** Dev *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if args.do_train: train_examples = read_ace_examples(input_file=args.train_file, is_training=True) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=True) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_start_positions, all_end_positions) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Train *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) eval_step = max(1, len(train_batches) // args.eval_per_epoch) best_result = None lrs = [args.learning_rate] if args.learning_rate else \ [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5] for lr in lrs: if not args.add_if_trigger_embedding: model = BertForQuestionAnswering.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) else: model = BertForQuestionAnswering_withIfTriggerEmbedding.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) param_optimizer = [ n for n in param_optimizer if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 global_step = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) for step, batch in enumerate(train_batches): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, if_trigger_ids, start_positions, end_positions = batch if not args.add_if_trigger_embedding: loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) else: loss = model(input_ids, segment_ids, if_trigger_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0 or step == 0: save_model = False if args.do_eval: # result, _, _ = evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) result, preds = evaluate(args, model, device, eval_dataloader, eval_examples, gold_examples, eval_features) # import ipdb; ipdb.set_trace() model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_batches), time.time() - start_time, tr_loss / nb_tr_steps)) logger.info( "!!! Best dev %s (lr=%s, epoch=%d): p_c: %.2f, r_c: %.2f, f1_c: %.2f, p_i: %.2f, r_i: %.2f, f1_i: %.2f, best_na_thresh: %.5f" % # logger.info("!!! Best dev %s (lr=%s, epoch=%d): p_c: %.2f, r_c: %.2f, f1_c: %.2f, best_na_thresh: %.10f" % # (args.eval_metric, str(lr), epoch, result["prec_c"], result["recall_c"], result["f1_c"], result["best_na_thresh"])) (args.eval_metric, str(lr), epoch, result["prec_c"], result["recall_c"], result["f1_c"], result["prec_i"], result["recall_i"], result["f1_i"], result["best_na_thresh"])) else: save_model = True if (int(args.num_train_epochs) - epoch < 3 and (step + 1) / len(train_batches) > 0.7 ) or step == 0: save_model = True else: save_model = False if save_model: model_to_save = model.module if hasattr( model, 'module') else model subdir = os.path.join( args.output_dir, "epoch{epoch}-step{step}".format(epoch=epoch, step=step)) if not os.path.exists(subdir): os.makedirs(subdir) output_model_file = os.path.join( subdir, WEIGHTS_NAME) output_config_file = os.path.join( subdir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(subdir) if best_result: with open( os.path.join(args.output_dir, "eval_results.txt"), "w") as writer: for key in sorted(best_result.keys()): writer.write( "%s = %s\n" % (key, str(best_result[key]))) if args.do_eval: if args.eval_test: eval_examples = read_ace_examples(input_file=args.test_file, is_training=False) gold_examples = read_ace_examples(input_file=args.gold_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, query_templates=query_templates, nth_query=args.nth_query, is_training=False) logger.info("***** Test *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_if_trigger_ids = torch.tensor( [f.if_trigger_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_if_trigger_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if not args.add_if_trigger_embedding: model = BertForQuestionAnswering.from_pretrained(args.model_dir) else: model = BertForQuestionAnswering_withIfTriggerEmbedding.from_pretrained( args.model_dir) if args.fp16: model.half() model.to(device) result, preds = evaluate(args, model, device, eval_dataloader, eval_examples, gold_examples, eval_features, pred_only=True) with open(os.path.join(args.model_dir, "test_results.txt"), "w") as writer: for key in result: writer.write("%s = %s\n" % (key, str(result[key]))) with open(os.path.join(args.model_dir, "arg_predictions.json"), "w") as writer: for key in preds: writer.write(json.dumps(preds[key], default=int) + "\n")
def BertSquad(file="", mode='predict', bert_model="bert-base-uncased", output='./output'): parser = {} parser["bert_model"] = bert_model parser["output_dir"] = output parser["train_file"] = file parser["predict_file"] = file parser["max_seq_length"] = 384 parser["doc_stride"] = 128 parser["max_query_length"] = 64 parser["do_train"] = mode == 'train' parser["do_predict"] = mode == 'predict' parser["train_batch_size"] = 32 parser["predict_batch_size"] = 8 parser["learning_rate"] = 5e-5 parser["num_train_epochs"] = 3.0 parser["warmup_proportion"] = 0.1 parser["n_best_size"] = 20 parser["max_answer_length"] = 30 parser["verbose_logging"] = False parser["no_cuda"] = False parser['seed'] = 42 parser['gradient_accumulation_steps'] = 1 parser["do_lower_case"] = ('uncased' in bert_model) parser["local_rank"] = -1 parser['fp16'] = False parser['overwrite_output_dir'] = False parser['loss_scale'] = 0 parser['version_2_with_negative'] = False parser['null_score_diff_threshold'] = 0.0 parser['server_ip'] = '' parser['server_port'] = '' args = AttrDict.AttrDict(parser) print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForQuestionAnswering.from_pretrained(args.bert_model) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # if args.local_rank != -1: # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--paragraph", default=None, type=str) parser.add_argument("--question", default=None, type=str) parser.add_argument("--model", default=None, type=str) parser.add_argument("--max_seq_length", default=384, type=int) parser.add_argument("--doc_stride", default=128, type=int) parser.add_argument("--max_query_length", default=64, type=int) parser.add_argument("--config_file", default=None, type=str) parser.add_argument("--max_answer_length", default=30, type=int) args = parser.parse_args() para_file = args.paragraph question_file = args.question model_path = args.model device = torch.device("cpu") ### Raeding paragraph # f = open(para_file, 'r') # para = f.read() # f.close() ## Reading question # f = open(ques_file, 'r') # ques = f.read() # f.close() # para_list = para.split('\n\n') f = open(para_file, "rb") para = f.read() para = para.decode('windows-1252') para = para.strip("\n").replace("\r", " ").replace("\n", "") #print(para) # print(para) f.close() f_ = open(question_file, "r") question = f_.read() question = question.split("\n") while "" in question: question.remove("") for q in question: q = q.strip("\n") f_.close() input_data = [] pfinder = ParaFinder(para) i = 0 for q in question: closest_para = pfinder.closestParagraph(q) paragraphs = {} paragraphs["id"] = i paragraphs["text"] = closest_para paragraphs["ques"] = [q] i += 1 input_data.append(paragraphs) # print(input_data) ## input_data is a list of dictionary which has a paragraph and questions examples = read_squad_examples(input_data) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) eval_features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) ### Loading Pretrained model for QnA config = BertConfig(args.config_file) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(model_path, map_location=device)) model.to(device) pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data pred_sampler = SequentialSampler(pred_data) pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9) predictions = [] for input_ids, input_mask, segment_ids, example_indices in pred_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) features = [] example = [] all_results = [] for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() feature = eval_features[example_index.item()] unique_id = int(feature.unique_id) features.append(feature) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output = predict(examples, features, all_results, args.max_answer_length) predictions.append(output) ### For printing the results #### index = None for example in examples: if index != example.example_id: # print(example.para_text) index = example.example_id # print('\n') # print(colored('***********Question and Answers *************', 'red')) ques_text = example.question_text print(ques_text) prediction, prob = predictions[math.floor(example.unique_id / 12)][example] if prob > 0.35: print(prediction) #print(type(prediction)) else: print("No result found")
def load(self): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help= "Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help= "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument('--do_lower_case', action="store_true", default=False, help="Lowercase the input") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) self.args = parser.parse_args([ '--bert_model=bert-base-cased', '--output_dir=./output', '--do_predict', '--predict_file=./squad/dev-v1.1.json' ]) if self.args.local_rank == -1 or self.args.no_cuda: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: self.device = torch.device("cuda", self.args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if self.args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) self.args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(self.device, n_gpu, bool(self.args.local_rank != -1), self.args.fp16)) if self.args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(self.args.gradient_accumulation_steps)) self.args.train_batch_size = int(self.args.train_batch_size / self.args.gradient_accumulation_steps) random.seed(self.args.seed) np.random.seed(self.args.seed) torch.manual_seed(self.args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(self.args.seed) if not self.args.do_train and not self.args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if self.args.do_train: if not self.args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified." ) if self.args.do_predict: if not self.args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(self.args.output_dir) and os.listdir( self.args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(self.args.output_dir, exist_ok=True) self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path) train_examples = None num_train_steps = None if self.args.do_train: train_examples = read_squad_examples( input_file=self.args.train_file, is_training=True) num_train_steps = int( len(train_examples) / self.args.train_batch_size / self.args.gradient_accumulation_steps * self.args.num_train_epochs) # Prepare model self.model = BertForQuestionAnswering.from_pretrained( pretrained_path, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(self.args.local_rank)) self.model.bert.load_state_dict( torch.load(finetuned_path, map_location='cpu')) if self.args.fp16: self.model.half() self.model.to(self.device) if self.args.local_rank != -1: self.model = torch.nn.parallel.DistributedDataParallel( self.model, device_ids=[self.args.local_rank], output_device=self.args.local_rank) elif n_gpu > 1: self.model = torch.nn.DataParallel(model) self.model.eval()
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_token_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--model_dir", default=None, type=str, required=True, help="학습된 모델이 저장되어 있는 path") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() #python run_triviaqa.py --bert_token_model bert-base-uncased --model_dir bert_triviaQA/ --output_dir result/ --predict_file dev-wiki-triviaqa_m.json --no_cuda --do_lower_case --predict_batch_size 40 print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_token_model, do_lower_case=args.do_lower_case) # Load Pretrained Model config_path = os.path.join(args.model_dir, CONFIG_NAME) model_path = os.path.join(args.model_dir, WEIGHTS_NAME) config = BertConfig(config_path) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(model_path, map_location='cpu')) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
def __init__(self): # Hyperparameters self.BERT_MODEL = "bert-base-uncased" self.OUTPUT_DIR = "bert-model" self.TRAIN_FILE = "" self.PREDICT_FILE = "squad/test-pred.json" self.MAX_SEQ_LENGTH = 384 self.DOC_STRIDE = 128 self.MAX_QUERY_LENGTH = 64 self.DO_TRAIN = False self.DO_PREDICT = True self.TRAIN_BATCH_SIZE = 12 self.PREDICT_BATCH_SIZE = 8 self.LEARNING_RATE = 3e-5 self.NUM_TRAIN_EPOCHS = 2.0 self.WARMUP_PROPORTION = 0.1 self.N_BEST_SIZE = 20 self.MAX_ANSWER_LENGTH = 30 self.VERBOSE_LOGGING = False self.NO_CUDA = False self.SEED = 42 self.GRADIENT_ACCUMULATION_STEPS = 1 self.DO_LOWER_CASE = True self.LOCAL_RANK = -1 self.FP16 = False self.LOSS_SCALE = 0 self.VERSION_2_WITH_NEGATIVE = True self.NULL_SCORE_DIFF_THRESHOLD = 0.0 if self.LOCAL_RANK == -1 or self.NO_CUDA: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.NO_CUDA else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(self.LOCAL_RANK) self.device = torch.device("cuda", self.LOCAL_RANK) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(self.device, n_gpu, bool(self.LOCAL_RANK != -1), self.FP16)) if self.GRADIENT_ACCUMULATION_STEPS < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(self.GRADIENT_ACCUMULATION_STEPS)) self.TRAIN_BATCH_SIZE = self.TRAIN_BATCH_SIZE // self.GRADIENT_ACCUMULATION_STEPS random.seed(self.SEED) np.random.seed(self.SEED) torch.manual_seed(self.SEED) if n_gpu > 0: torch.cuda.manual_seed_all(self.SEED) if not self.DO_TRAIN and not self.DO_PREDICT: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if self.DO_TRAIN: if not self.TRAIN_FILE: raise ValueError( "If `do_train` is True, then `train_file` must be specified." ) if self.DO_PREDICT: if not self.PREDICT_FILE: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(self.OUTPUT_DIR) and os.listdir( self.OUTPUT_DIR) and self.DO_TRAIN: raise ValueError( "Output directory () already exists and is not empty.") if not os.path.exists(self.OUTPUT_DIR): os.makedirs(self.OUTPUT_DIR) self.tokenizer = BertTokenizer.from_pretrained( self.BERT_MODEL, do_lower_case=self.DO_LOWER_CASE) train_examples = None num_train_optimization_steps = None # Prepare model self.model = BertForQuestionAnswering.from_pretrained( self.BERT_MODEL, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(self.LOCAL_RANK))) if self.FP16: self.model.half() self.model.to(self.device) if self.LOCAL_RANK != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) self.model = DDP(self.model) elif n_gpu > 1: self.model = torch.nn.DataParallel(self.model) # Prepare optimizer param_optimizer = list(self.model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if self.FP16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=self.LEARNING_RATE, bias_correction=False, max_grad_norm=1.0) if self.LOSS_SCALE == 0: optimizer = self.FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = self.FP16_Optimizer( optimizer, static_loss_scale=self.LOSS_SCALE) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=self.LEARNING_RATE, warmup=self.WARMUP_PROPORTION, t_total=num_train_optimization_steps) # self.model = BertForQuestionAnswering.from_pretrained(self.BERT_MODEL) output_model_file = os.path.join(self.OUTPUT_DIR, WEIGHTS_NAME) output_config_file = os.path.join(self.OUTPUT_DIR, CONFIG_NAME) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) self.model = BertForQuestionAnswering(config) if torch.cuda.is_available(): self.model.load_state_dict(torch.load(output_model_file)) else: self.model.load_state_dict( torch.load(output_model_file, map_location='cpu')) self.model.to(self.device) print('\n*** QA MODULE READY [1/3] ***\n')
class Bert(object): def __init__(self): # Hyperparameters self.BERT_MODEL = "bert-base-uncased" self.OUTPUT_DIR = "bert-model" self.TRAIN_FILE = "" self.PREDICT_FILE = "squad/test-pred.json" self.MAX_SEQ_LENGTH = 384 self.DOC_STRIDE = 128 self.MAX_QUERY_LENGTH = 64 self.DO_TRAIN = False self.DO_PREDICT = True self.TRAIN_BATCH_SIZE = 12 self.PREDICT_BATCH_SIZE = 8 self.LEARNING_RATE = 3e-5 self.NUM_TRAIN_EPOCHS = 2.0 self.WARMUP_PROPORTION = 0.1 self.N_BEST_SIZE = 20 self.MAX_ANSWER_LENGTH = 30 self.VERBOSE_LOGGING = False self.NO_CUDA = False self.SEED = 42 self.GRADIENT_ACCUMULATION_STEPS = 1 self.DO_LOWER_CASE = True self.LOCAL_RANK = -1 self.FP16 = False self.LOSS_SCALE = 0 self.VERSION_2_WITH_NEGATIVE = True self.NULL_SCORE_DIFF_THRESHOLD = 0.0 if self.LOCAL_RANK == -1 or self.NO_CUDA: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.NO_CUDA else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(self.LOCAL_RANK) self.device = torch.device("cuda", self.LOCAL_RANK) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}" .format(self.device, n_gpu, bool(self.LOCAL_RANK != -1), self.FP16)) if self.GRADIENT_ACCUMULATION_STEPS < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(self.GRADIENT_ACCUMULATION_STEPS)) self.TRAIN_BATCH_SIZE = self.TRAIN_BATCH_SIZE // self.GRADIENT_ACCUMULATION_STEPS random.seed(self.SEED) np.random.seed(self.SEED) torch.manual_seed(self.SEED) if n_gpu > 0: torch.cuda.manual_seed_all(self.SEED) if not self.DO_TRAIN and not self.DO_PREDICT: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if self.DO_TRAIN: if not self.TRAIN_FILE: raise ValueError( "If `do_train` is True, then `train_file` must be specified." ) if self.DO_PREDICT: if not self.PREDICT_FILE: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(self.OUTPUT_DIR) and os.listdir( self.OUTPUT_DIR) and self.DO_TRAIN: raise ValueError( "Output directory () already exists and is not empty.") if not os.path.exists(self.OUTPUT_DIR): os.makedirs(self.OUTPUT_DIR) self.tokenizer = BertTokenizer.from_pretrained( self.BERT_MODEL, do_lower_case=self.DO_LOWER_CASE) train_examples = None num_train_optimization_steps = None # Prepare model self.model = BertForQuestionAnswering.from_pretrained( self.BERT_MODEL, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(self.LOCAL_RANK))) if self.FP16: self.model.half() self.model.to(self.device) if self.LOCAL_RANK != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) self.model = DDP(self.model) elif n_gpu > 1: self.model = torch.nn.DataParallel(self.model) # Prepare optimizer param_optimizer = list(self.model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if self.FP16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=self.LEARNING_RATE, bias_correction=False, max_grad_norm=1.0) if self.LOSS_SCALE == 0: optimizer = self.FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = self.FP16_Optimizer( optimizer, static_loss_scale=self.LOSS_SCALE) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=self.LEARNING_RATE, warmup=self.WARMUP_PROPORTION, t_total=num_train_optimization_steps) # self.model = BertForQuestionAnswering.from_pretrained(self.BERT_MODEL) output_model_file = os.path.join(self.OUTPUT_DIR, WEIGHTS_NAME) output_config_file = os.path.join(self.OUTPUT_DIR, CONFIG_NAME) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) self.model = BertForQuestionAnswering(config) if torch.cuda.is_available(): self.model.load_state_dict(torch.load(output_model_file)) else: self.model.load_state_dict( torch.load(output_model_file, map_location='cpu')) self.model.to(self.device) print('\n*** QA MODULE READY [1/3] ***\n') def read_squad_examples(self, input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord( c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer." ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) examples.append(example) return examples def convert_examples_to_features(self, examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 features = [] for (example_index, example) in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) tok_start_position = None tok_end_position = None if is_training and example.is_impossible: tok_start_position = -1 tok_end_position = -1 if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = self._improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} token_is_max_context = {} segment_ids = [] tokens.append("[CLS]") segment_ids.append(0) for token in query_tokens: tokens.append(token) segment_ids.append(0) tokens.append("[SEP]") segment_ids.append(0) for i in range(doc_span.length): split_token_index = doc_span.start + i token_to_orig_map[len( tokens)] = tok_to_orig_index[split_token_index] is_max_context = self._check_is_max_context( doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(1) tokens.append("[SEP]") segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(0) input_mask.append(0) segment_ids.append(0) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length start_position = None end_position = None if is_training and not example.is_impossible: # For training, if our document chunk does not contain an annotation # we throw it out, since there is nothing to predict. doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 end_position = 0 else: doc_offset = len(query_tokens) + 2 start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset if is_training and example.is_impossible: start_position = 0 end_position = 0 features.append( InputFeatures(unique_id=unique_id, example_index=example_index, doc_span_index=doc_span_index, tokens=tokens, token_to_orig_map=token_to_orig_map, token_is_max_context=token_is_max_context, input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, start_position=start_position, end_position=end_position, is_impossible=example.is_impossible)) unique_id += 1 return features def _improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. We first project them to # whitespace-tokenized words. But then after WordPiece tokenization, we can # often find a "better match". For example: # # Question: What year was John Smith born? # Context: The leader was John Smith (1895-1943). # Answer: 1895 # # The original whitespace-tokenized answer will be "(1895-1943).". However # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match # the exact answer, 1895. # # However, this is not always possible. Consider the following: # # Question: What country is the top exporter of electornics? # Context: The Japanese electronics industry is the lagest in the world. # Answer: Japan # # In this case, the annotator chose "Japan" as a character sub-span of # the word "Japanese". Since our WordPiece tokenizer does not split # "Japanese", we just use "Japanese" as the annotation. This is fairly rare # in SQuAD, but does happen. tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) return (input_start, input_end) def _check_is_max_context(self, doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # Because of the sliding window approach taken to scoring documents, a single # token can appear in multiple documents. E.g. # Doc: the man went to the store and bought a gallon of milk # Span A: the man went to the # Span B: to the store and bought # Span C: and bought a gallon of # ... # # Now the word 'bought' will have two scores from spans B and C. We only # want to consider the score with "maximum context", which we define as # the *minimum* of its left and right context (the *sum* of left and # right context will always be the same, of course). # # In the example the maximum context for 'bought' would be span C since # it has 1 left context and 3 right context, while span B has 4 left context # and 0 right context. best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue if position > end: continue num_left_context = position - doc_span.start num_right_context = end - position score = min(num_left_context, num_right_context) + 0.01 * doc_span.length if best_score is None or score > best_score: best_score = score best_span_index = span_index return cur_span_index == best_span_index RawResult = collections.namedtuple( "RawResult", ["unique_id", "start_logits", "end_logits"]) def write_predictions(self, all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, version_2_with_negative, null_score_diff_threshold): example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", [ "feature_index", "start_index", "end_index", "start_logit", "end_logit" ]) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive min_null_feature_index = 0 # the paragraph slice with min null score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = self._get_best_indexes(result.start_logits, n_best_size) end_indexes = self._get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: feature_null_score = result.start_logits[ 0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index null_start_logit = result.start_logits[0] null_end_logit = result.end_logits[0] for start_index in start_indexes: for end_index in end_indexes: # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. if start_index >= len(feature.tokens): continue if end_index >= len(feature.tokens): continue if start_index not in feature.token_to_orig_map: continue if end_index not in feature.token_to_orig_map: continue if not feature.token_is_max_context.get( start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], end_logit=result.end_logits[end_index])) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction(feature_index=min_null_feature_index, start_index=0, end_index=0, start_logit=null_start_logit, end_logit=null_end_logit)) prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_logit", "end_logit"]) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction tok_tokens = feature.tokens[pred.start_index:( pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[ pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start:( orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. tok_text = tok_text.replace(" ##", "") tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) final_text = self.get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True else: final_text = "" seen_predictions[final_text] = True nbest.append( _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: nbest.insert( 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: nbest.append( _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_logit + entry.end_logit) if not best_non_null_entry: if entry.text: best_non_null_entry = entry probs = self._compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_logit"] = entry.start_logit output["end_logit"] = entry.end_logit nbest_json.append(output) assert len(nbest_json) >= 1 if not version_2_with_negative: all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold score_diff = score_null - best_non_null_entry.start_logit - ( best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" else: all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json ''' with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") with open(output_nbest_file, "w") as writer: writer.write(json.dumps(all_nbest_json, indent=4) + "\n") if version_2_with_negative: with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") ''' return all_predictions[example.qas_id] def get_final_text(self, pred_text, orig_text, do_lower_case, verbose_logging=False): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So # now `orig_text` contains the span of our original text corresponding to the # span that we predicted. # # However, `orig_text` may contain extra characters that we don't want in # our prediction. # # For example, let's say: # pred_text = steve smith # orig_text = Steve Smith's # # We don't want to return `orig_text` because it contains the extra "'s". # # We don't want to return `pred_text` because it's already been normalized # (the SQuAD eval script also does punctuation stripping/lower casing but # our tokenizer does additional normalization like stripping accent # characters). # # What we really want to return is "Steve Smith". # # Therefore, we have to apply a semi-complicated alignment heuristic between # `pred_text` and `orig_text` to get a character-to-character alignment. This # can fail in certain cases in which case we just return `orig_text`. def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() for (i, c) in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i ns_chars.append(c) ns_text = "".join(ns_chars) return (ns_text, ns_to_s_map) # We first tokenize `orig_text`, strip whitespace from the result # and `pred_text`, and check if they are the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. tokenizer = BasicTokenizer(do_lower_case=do_lower_case) tok_text = " ".join(tokenizer.tokenize(orig_text)) start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logger.info( "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} for (i, tok_index) in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None if start_position in tok_s_to_ns_map: ns_start_position = tok_s_to_ns_map[start_position] if ns_start_position in orig_ns_to_s_map: orig_start_position = orig_ns_to_s_map[ns_start_position] if orig_start_position is None: if verbose_logging: logger.info("Couldn't map start position") return orig_text orig_end_position = None if end_position in tok_s_to_ns_map: ns_end_position = tok_s_to_ns_map[end_position] if ns_end_position in orig_ns_to_s_map: orig_end_position = orig_ns_to_s_map[ns_end_position] if orig_end_position is None: if verbose_logging: logger.info("Couldn't map end position") return orig_text output_text = orig_text[orig_start_position:(orig_end_position + 1)] return output_text def _get_best_indexes(self, logits, n_best_size): """Get the n-best logits from a list.""" index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) best_indexes = [] for i in range(len(index_and_score)): if i >= n_best_size: break best_indexes.append(index_and_score[i][0]) return best_indexes def _compute_softmax(self, scores): """Compute softmax probability over raw logits.""" if not scores: return [] max_score = None for score in scores: if max_score is None or score > max_score: max_score = score exp_scores = [] total_sum = 0.0 for score in scores: x = math.exp(score - max_score) exp_scores.append(x) total_sum += x probs = [] for score in exp_scores: probs.append(score / total_sum) return probs def get_answer(self, question, article): if self.DO_PREDICT and (self.LOCAL_RANK == -1 or torch.distributed.get_rank() == 0): def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord( c) == 0x202F: return True return False doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in article: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) eval_examples = [] example = SquadExample(qas_id="1234", question_text=question, doc_tokens=doc_tokens, orig_answer_text=None, start_position=None, end_position=None, is_impossible=False) eval_examples.append(example) eval_features = self.convert_examples_to_features( examples=eval_examples, tokenizer=self.tokenizer, max_seq_length=self.MAX_SEQ_LENGTH, doc_stride=self.DOC_STRIDE, max_query_length=self.MAX_QUERY_LENGTH, is_training=False) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.PREDICT_BATCH_SIZE) self.model.eval() all_results = [] for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) with torch.no_grad(): batch_start_logits, batch_end_logits = self.model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist( ) end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( self.RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(self.OUTPUT_DIR, "predictions.json") output_nbest_file = os.path.join(self.OUTPUT_DIR, "nbest_predictions.json") output_null_log_odds_file = os.path.join(self.OUTPUT_DIR, "null_odds.json") return self.write_predictions( eval_examples, eval_features, all_results, self.N_BEST_SIZE, self.MAX_ANSWER_LENGTH, self.DO_LOWER_CASE, output_prediction_file, output_nbest_file, output_null_log_odds_file, self.VERBOSE_LOGGING, self.VERSION_2_WITH_NEGATIVE, self.NULL_SCORE_DIFF_THRESHOLD)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # device = torch.device("cuda:1") n_gpu = torch.cuda.device_count() # n_gpu = 1 else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForQuestionAnswering.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps print(loss, end=" \r") if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForQuestionAnswering.from_pretrained( args.bert_model, state_dict=model_state_dict) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--bin_path", default=None, type=str, help="bin path") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=5, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=500, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) # args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # if not args.do_train and not args.do_predict: # raise ValueError("At least one of `do_train` or `do_predict` must be True.") # if args.do_train: # if not args.train_file: # raise ValueError( # "If `do_train` is True, then `train_file` must be specified.") # if args.do_predict: # if not args.predict_file: # raise ValueError( # "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(args.bin_path) model = BertForQuestionAnswering.from_pretrained( args.bert_model, state_dict=model_state_dict) model.to(device) eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def answer_prediction(paras,question,model,config_file,max_seq_length=384,doc_stride=128,max_query_length=64,max_answer_length=60): #para_file = 'Input_file.txt' model_path = model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() ### Raeding paragraph ## Reading question # f = open(ques_file, 'r') # ques = f.read() # f.close() ## input_data is a list of dictionary which has a paragraph and questions #para_list = para.split('\n\n') #print(paras) input_data = [] i = 1 for i,para in enumerate(paras): # print(para) paragraphs = {} #splits = para.split('\nQuestions:') paragraphs['id'] = i paragraphs['text'] = para paragraphs['ques']= question input_data.append(paragraphs) examples = read_paragraphs(input_data,question) tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', do_lower_case=True) eval_features = convert_examples_to_features( examples = examples, tokenizer=tokenizer, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) ### Loading Pretrained model for QnA config = BertConfig(config_file) model = BertForQuestionAnswering(config) model.load_state_dict(torch.load(model_path,map_location='cpu')) model.to(device) pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data pred_sampler = SequentialSampler(pred_data) pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=10) predictions = [] for input_ids, input_mask, segment_ids, example_indices in tqdm(pred_dataloader): model.eval() input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) features=[] example = [] all_results = [] for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() feature = eval_features[example_index.item()] unique_id = int(feature.unique_id) features.append(feature) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output = predict(examples, features, all_results,max_answer_length) predictions.append(output) ### For printing the results #### final_preds = [] final_paras = [] final_probs = [] final_scores = [] final_ques = [] index = None for i,example in enumerate(examples): if index!= example.example_id: index = example.example_id # ques_text = colored(example.question_text, 'blue') prediction = predictions[math.floor(example.unique_id/12)][example] prob = predictions[math.floor(example.unique_id/12)]['prob'+str(example)] final_ques.append(ques_text) final_preds.append(prediction) final_paras.append(example.para_text) final_probs.append(prob) return final_ques,final_preds,final_paras,final_probs
def __init__(self): self.bert_model = "bert-base-uncased" self.pretrained_model = "./pretrained" self.output_dir = "./tmp" self.max_seq_length = 384 self.doc_stride = 128 self.max_query_length = 64 self.do_predict = True self.predict_batch_size = 8 self.n_best_size = 20 self.max_query_length = 30 self.verbose_logging = False self.no_cuda = False self.seed = 42 self.fp16 = False self.local_rank = -1 self.max_answer_length = 30 self.do_lower_case = True if self.local_rank == -1 or self.no_cuda: self.device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: self.device = torch.device("cuda", self.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if self.fp16: logger.info( "16-bits training currently not supported in distributed training" ) self.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(self.device, n_gpu, bool(self.local_rank != -1), self.fp16)) random.seed(self.seed) np.random.seed(self.seed) torch.manual_seed(self.seed) if n_gpu > 0: torch.cuda.manual_seed_all(self.seed) if os.path.exists(self.output_dir) and os.listdir(self.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(self.output_dir, exist_ok=True) self.tokenizer = BertTokenizer.from_pretrained(self.bert_model) # Prepare model self.model = BertForQuestionAnswering.from_pretrained( self.pretrained_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(self.local_rank)) if self.fp16: self.model.half() self.model.to(self.device) if self.local_rank != -1: self.model = torch.nn.parallel.DistributedDataParallel( self.model, device_ids=[self.local_rank], output_device=self.local_rank) elif n_gpu > 1: self.model = torch.nn.DataParallel(self.model) self.model.eval()
def main(): args = set_config() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForQuestionAnswering2.from_pretrained(args.bert_model) # model = BertForQuestionAnswering1.from_pretrained(args.bert_model) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format(0)) # model_state_dict = torch.load(output_model_file) # model = BertForQuestionAnswering2.from_pretrained(args.bert_model, state_dict=model_state_dict) model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) global_step = 0 if args.do_train: # load train data train_examples, train_features = get_train_feature( args, args.do_train, tokenizer) train_data = DataIteratorPack(train_features, train_examples, args.train_batch_size, device, sent_limit=None, sequential=False) # train_example_dict = example_dict(train_examples) # load dev data eval_examples, eval_features = get_train_feature( args, not args.do_train, tokenizer) logger.info("***** Running training *****") logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) total_train_loss = 0 VERBOSE_STEP = 100 grad_accumulate_step = 1 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() # learning rate decay # if epoch > 1: # args.learning_rate = args.learning_rate * args.decay # for param_group in optimizer.param_groups: # param_group['lr'] = args.learning_rate # print('lr = {}'.format(args.learning_rate)) for step, batch in enumerate(train_data): # batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids = batch["context_idxs"] input_mask = batch["context_mask"] segment_ids = batch["segment_idxs"] start_positions = batch["y1"] end_positions = batch["y2"] q_types = batch["q_type"] loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, q_types) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (global_step + 1) % grad_accumulate_step == 0: optimizer.step() optimizer.zero_grad() total_train_loss += loss global_step += 1 if global_step % VERBOSE_STEP == 0: print("-- In Epoch{}: ".format(epoch)) print("Avg-LOSS: {}/batch/step: {}".format( total_train_loss / VERBOSE_STEP, global_step / VERBOSE_STEP)) total_train_loss = 0 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model_{}.bin".format(epoch)) if global_step % VERBOSE_STEP == 0: if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned # model_state_dict = torch.load(output_model_file) # model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) else: model = BertForQuestionAnswering.from_pretrained( args.bert_model) model.to(device) train_data.refresh() if args.do_predict: eval_examples_dict = example_dict(eval_examples) # eval_features_dict = example_dict(eval_features) logger.info("***** Running predictions *****") logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor( [f.doc_input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.doc_input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.doc_segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) # eval_dataloader = DataIteratorPack(eval_features, eval_examples, args.train_batch_size, device, sent_limit=None, # sequential=False) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits, batch_types = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = [ batch_start_logits[j][i].detach().cpu().tolist() for j in range(len((batch_start_logits))) ] end_logits = [ batch_end_logits[j][i].detach().cpu().tolist() for j in range(len((batch_end_logits))) ] eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) types = batch_types[i].detach().cpu().tolist() all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits, types=types)) output_prediction_file = os.path.join( args.output_dir, "predictions_{}.json".format(epoch)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(epoch)) output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(epoch)) all_predictions = write_predictions( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) metrics = evaluate(eval_examples_dict, all_predictions) print('epoch {:3d} | EM {:.4f} | F1 {:.4f}'.format( epoch, metrics['exact_match'], metrics['f1']))
import math import os import random import pickle from tqdm import tqdm, trange import numpy as np import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer from pytorch_pretrained_bert.modeling import BertForQuestionAnswering from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE model = BertForQuestionAnswering.from_pretrained("bert-base-uncased") device = "cuda:0" model.train(device) param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=0.0001) print(optimizer_grouped_parameters)
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = \ args.train_batch_size // args.gradient_accumulation_steps if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train: assert (args.train_file is not None) and (args.dev_file is not None) if args.eval_test: assert args.test_file is not None else: assert args.dev_file is not None if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) if args.do_train or (not args.eval_test): with gzip.GzipFile(args.dev_file, 'r') as reader: content = reader.read().decode('utf-8').strip().split('\n')[1:] eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=args.dev_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Dev *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if args.do_train: train_examples = read_mrqa_examples(input_file=args.train_file, is_training=True) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Train *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) eval_step = max(1, len(train_batches) // args.eval_per_epoch) best_result = None lrs = [args.learning_rate] if args.learning_rate else [ 1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5 ] for lr in lrs: model = BertForQuestionAnswering.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) param_optimizer = [ n for n in param_optimizer if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 global_step = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) for step, batch in enumerate(train_batches): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: result, _, _ = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( "!!! Best dev %s (lr=%s, epoch=%d): %.2f" % (args.eval_metric, str(lr), epoch, result[args.eval_metric])) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: with open( os.path.join(args.output_dir, EVAL_FILE), "w") as writer: for key in sorted(best_result.keys()): writer.write( "%s = %s\n" % (key, str(best_result[key]))) if args.do_eval: if args.eval_test: with gzip.GzipFile(args.test_file, 'r') as reader: content = reader.read().decode('utf-8').strip().split('\n')[1:] eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=args.test_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Test *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) model = BertForQuestionAnswering.from_pretrained(args.output_dir) if args.fp16: model.half() model.to(device) result, preds, nbest_preds = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) with open(os.path.join(args.output_dir, PRED_FILE), "w") as writer: writer.write(json.dumps(preds, indent=4) + "\n") with open(os.path.join(args.output_dir, TEST_FILE), "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key])))
def train(args): args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) tokenizer = BertTokenizer.from_pretrained( modelconfig.MODEL_ARCHIVE_MAP[args.bert_model]) train_examples = data_utils.read_squad_examples(os.path.join( args.data_dir, "train.json"), is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs train_features = data_utils.convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_start_positions, all_end_positions) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # >>>>> validation if args.do_valid: valid_examples = data_utils.read_squad_examples(os.path.join( args.data_dir, "dev.json"), is_training=True) valid_features = data_utils.convert_examples_to_features( valid_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True) valid_all_input_ids = torch.tensor( [f.input_ids for f in valid_features], dtype=torch.long) valid_all_segment_ids = torch.tensor( [f.segment_ids for f in valid_features], dtype=torch.long) valid_all_input_mask = torch.tensor( [f.input_mask for f in valid_features], dtype=torch.long) valid_all_start_positions = torch.tensor( [f.start_position for f in valid_features], dtype=torch.long) valid_all_end_positions = torch.tensor( [f.end_position for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids, valid_all_input_mask, valid_all_start_positions, valid_all_end_positions) logger.info("***** Running validations *****") logger.info(" Num orig examples = %d", len(valid_examples)) logger.info(" Num split examples = %d", len(valid_features)) logger.info(" Batch size = %d", args.train_batch_size) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.train_batch_size) best_valid_loss = float('inf') valid_losses = [] # <<<<< end of validation declaration if not args.bert_model.endswith(".pt"): model = BertForQuestionAnswering.from_pretrained( modelconfig.MODEL_ARCHIVE_MAP[args.bert_model]) else: model = torch.load(args.bert_model) if args.fp16: model.half() model.cuda() # Prepare optimizer param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad == True] param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 model.train() for _ in range(args.num_train_epochs): for step, batch in enumerate(train_dataloader): batch = tuple(t.cuda() for t in batch) input_ids, segment_ids, input_mask, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # >>>> perform validation at the end of each epoch . if args.do_valid: model.eval() with torch.no_grad(): losses = [] valid_size = 0 for step, batch in enumerate(valid_dataloader): batch = tuple( t.cuda() for t in batch) # multi-gpu does scattering it-self input_ids, segment_ids, input_mask, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) losses.append(loss.data.item() * input_ids.size(0)) valid_size += input_ids.size(0) valid_loss = sum(losses) / valid_size logger.info("validation loss: %f", valid_loss) valid_losses.append(valid_loss) if valid_loss < best_valid_loss: torch.save(model, os.path.join(args.output_dir, "model.pt")) best_valid_loss = valid_loss model.train() if args.do_valid: with open(os.path.join(args.output_dir, "valid.json"), "w") as fw: json.dump({"valid_losses": valid_losses}, fw) else: torch.save(model, os.path.join(args.output_dir, "model.pt"))
def main(): parser = argparse.ArgumentParser() parser.add_argument("--paragraph", default=None, type=str) parser.add_argument("--model", default=None, type=str) parser.add_argument("--max_seq_length", default=384, type=int) parser.add_argument("--doc_stride", default=128, type=int) parser.add_argument("--max_query_length", default=64, type=int) parser.add_argument("--config_file", default=None, type=str) parser.add_argument("--max_answer_length", default=30, type=int) args = parser.parse_args() para_file = args.paragraph model_path = args.model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() ### Loading Pretrained model for QnA print("Loading BERT-model...\n\n") config = BertConfig(args.config_file) model = BertForQuestionAnswering(config) model.load_state_dict( torch.load(model_path, map_location=torch.device("cpu"))) model.to(device) tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) while True: print("Please specify paragraph: \n " "1: Assisted Time Holdover \n " "2: Semcon short version \n " "3: Semcon long version") choice = input() if choice == "1": break elif choice == "2": para_file = "bert/input/semcon_short.txt" break elif choice == "3": para_file = "bert/input/semcon.txt" break else: print("I did not understand that, please type in 1, 2 or 3. \n") ### Reading paragraph f = open(para_file, "r") para = f.read() f.close() print("\nParagraph:\n", para) while True: input_data = [] paragraphs = {} paragraphs["id"] = 1 # paragraphs["text"] = splits[0].replace("Paragraph:", "").strip("\n") paragraphs["text"] = para paragraphs["ques"] = [input("\n What is your question?\n")] if paragraphs["ques"] == ["exit"]: exit() start = time.time() input_data.append(paragraphs) ## input_data is a list of dictionary which has a paragraph and questions examples = read_squad_examples(input_data) eval_features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, ) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data pred_sampler = SequentialSampler(pred_data) pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9) predictions = [] for input_ids, input_mask, segment_ids, example_indices in pred_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) features = [] example = [] all_results = [] for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() feature = eval_features[example_index.item()] unique_id = int(feature.unique_id) features.append(feature) all_results.append( RawResult( unique_id=unique_id, start_logits=start_logits, end_logits=end_logits, )) output = predict(examples, features, all_results, args.max_answer_length) predictions.append(output) prediction = colored( predictions[math.floor(examples[0].unique_id / 12)][examples[0]], "green", attrs=["reverse"], ) print(prediction, "\n") print("Time: ", time.time() - start) """
def main(): args = set_config() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) global_step = 0 if args.do_train: # load train data train_examples, train_features = get_train_feature( args, args.do_train, tokenizer) for f in train_features: print(len(f.doc_input_ids)) train_data = DataIteratorPack(train_features, train_examples, args.train_batch_size, device, sent_limit=25, sequential=False) # load dev data eval_examples, eval_features = get_train_feature( args, not args.do_train, tokenizer) eval_data = DataIteratorPack(eval_features, eval_examples, args.predict_batch_size, device, sent_limit=25, sequential=False) logger.info("***** Running training *****") logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) total_train_loss = 0 VERBOSE_STEP = 100 grad_accumulate_step = 1 best_dev_F1 = None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() # learning rate decay # if epoch > 1: # args.learning_rate = args.learning_rate * args.decay # for param_group in optimizer.param_groups: # param_group['lr'] = args.learning_rate # print('lr = {}'.format(args.learning_rate)) for step, batch in enumerate(train_data): # batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids = batch["context_idxs"] input_mask = batch["context_mask"] segment_ids = batch["segment_idxs"] start_positions = batch["y1"] end_positions = batch["y2"] q_types = batch["q_type"] loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, q_types, batch=batch) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (global_step + 1) % grad_accumulate_step == 0: optimizer.step() optimizer.zero_grad() total_train_loss += loss global_step += 1 if global_step % VERBOSE_STEP == 0: print("-- In Epoch{}: ".format(epoch)) print("Avg-LOSS: {}/batch/step: {}".format( total_train_loss / VERBOSE_STEP, global_step / VERBOSE_STEP)) total_train_loss = 0 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self train_data.refresh() if args.do_predict: eval_examples_dict = example_dict(eval_examples) # eval_features_dict = example_dict(eval_features) logger.info("***** Running predictions *****") logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] sp_dict = {} logger.info("Start evaluating") for step, batch in enumerate(eval_data): # batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids = batch["context_idxs"] input_mask = batch["context_mask"] segment_ids = batch["segment_idxs"] if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) with torch.no_grad(): batch_start_logits, batch_end_logits, batch_types, sp = model( input_ids, segment_ids, input_mask, batch=batch) for i, example_index in enumerate(batch["ids"]): start_logits = batch_start_logits[i].detach().cpu( ).tolist() end_logits = batch_end_logits[i].detach().cpu().tolist( ) # eval_feature = eval_features[example_index.item()] unique_id = batch['unique_id'][i] print(unique_id) types = batch_types[i].detach().cpu().tolist() all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits, types=types)) predict_support_np = torch.sigmoid( sp[:, :, 1]).data.cpu().numpy() for i in range(predict_support_np.shape[0]): cur_sp_pred = [] cur_id = batch['ids'][i] for j in range(predict_support_np.shape[1]): if j >= len(eval_examples_dict[cur_id].sent_names): break if predict_support_np[i, j] > args.sp_threshold: cur_sp_pred.append( eval_examples_dict[cur_id].sent_names[j]) sp_dict.update({cur_id: cur_sp_pred}) answer_dict = convert_to_tokens(eval_examples, eval_features, all_results) metrics = evaluate(eval_examples_dict, answer_dict) print('hotpotqa epoch {:3d} | EM {:.4f} | F1 {:.4f}'.format( epoch, metrics['exact_match'], metrics['f1'])) output_prediction_file = os.path.join( args.output_dir, "predictions_{}.json".format(epoch)) output_nbest_file = os.path.join( args.output_dir, "nbest_predictions_{}.json".format(epoch)) output_null_log_odds_file = os.path.join( args.output_dir, "null_odds_{}.json".format(epoch)) all_predictions = write_predictions( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) metrics = evaluate(eval_examples_dict, all_predictions) print('squad epoch {:3d} | EM {:.4f} | F1 {:.4f}'.format( epoch, metrics['exact_match'], metrics['f1'])) dev_F1 = metrics['f1'] #learning rate decay if best_dev_F1 is None or dev_F1 > best_dev_F1: best_dev_F1 = dev_F1 output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) cur_patience = 0 else: cur_patience += 1 if cur_patience >= args.patience: for param_group in optimizer.param_groups: param_group['lr'] /= 2.0 if param_group['lr'] < 1e-6: stop_train = True break cur_patience = 0
'Neopterin as an indicator of immune activation and prognosis in patients with gynecological malignancies.', text_b=None, label='1') ] use_cuda = False max_seq_length = 50 device = torch.device("cuda") if (use_cuda) else torch.device("cpu") bert_model = 'bert-base-uncased' cache_dir = '/home/dpappas/bert_cache/' bert_tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True, cache_dir=cache_dir) bert_model = BertForQuestionAnswering.from_pretrained( bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1)).to(device) eval_features = convert_examples_to_features(eval_examples, max_seq_length, bert_tokenizer) eval_feat = eval_features[0] input_ids = torch.tensor([eval_feat.input_ids], dtype=torch.long).to(device) input_mask = torch.tensor([eval_feat.input_mask], dtype=torch.long).to(device) segment_ids = torch.tensor([eval_feat.segment_ids], dtype=torch.long).to(device) tokens = eval_feat.tokens token_embeds, pooled_output = bert_model.bert(input_ids, segment_ids, input_mask, output_all_encoded_layers=False)
def main(args=None): if args is None: args = model_utils.run_redundancy_span_get_local_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') # logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( # device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError("If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError("If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_many_examples(input_file=args.train_file, is_training=True) num_train_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model """=========fine-tune 1===========""" model = BertForQuestionAnswering.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) print (PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) """==============================""" if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) max_eval_accuracy = 0 global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: # logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_many_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) # all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) # all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) for _ in trange(int(args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, start_positions=start_positions, end_positions=end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward(retain_graph=True) if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * span_utils.warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model.eval() all_results = [] # logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices, in tqdm(eval_dataloader, desc="Evaluating"): # if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] # unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=int(eval_feature.unique_id), start_logits=start_logits, end_logits=end_logits)) eval_accuracy,instance_num, all_predictions = write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, args.verbose_logging) if max_eval_accuracy < eval_accuracy: max_eval_accuracy = eval_accuracy # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") with open(output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") # with open(output_nbest_file, "w") as writer: # writer.write(json.dumps(all_nbest_json, indent=4) + "\n") result_json = collections.OrderedDict() result_json['result'] = eval_accuracy result_json['eval_accuracy'] = eval_accuracy result_json['instance_num'] = instance_num with open(output_nbest_file, "w") as writer: writer.write(json.dumps(result_json, indent=4) + "\n")
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) args = parser.parse_args() eval_period = 1200 if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 save_path_ls = [] if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 print("Saving model...") save_path = os.path.join(args.output_dir, "step_" + str(global_step) + ".pth") torch.save(model.state_dict(), save_path) save_path_ls.append(save_path) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) for save_dir in save_path_ls: model.load_state_dict(torch.load(save_dir)) model_name = save_dir.split('/')[-1] step_name = model_name[:-4] output_path = os.path.join(args.output_dir, step_name) if not os.path.exists(output_path): os.makedirs(output_path) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist( ) end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(output_path, "predictions.json") output_nbest_file = os.path.join(output_path, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--paragraph", default=None, type=str) parser.add_argument("--model", default=None, type=str) parser.add_argument("--max_seq_length", default=384, type=int) parser.add_argument("--doc_stride", default=128, type=int) parser.add_argument("--max_query_length", default=64, type=int) parser.add_argument("--config_file", default=None, type=str) parser.add_argument("--max_answer_length", default=30, type=int) args = parser.parse_args() para_file = args.paragraph model_path = args.model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(device) n_gpu = torch.cuda.device_count() ### Raeding paragraph f = open(para_file, 'r') para = f.read() f.close() ## Reading question # f = open(ques_file, 'r') # ques = f.read() # f.close() para_list = para.split('\n\n') input_data = [] i = 1 for para in para_list: paragraphs = {} splits = para.split('\nQuestions:') paragraphs['id'] = i paragraphs['text'] = splits[0].replace('Paragraph:', '').strip('\n') paragraphs['ques'] = splits[1].lstrip('\n').split('\n') input_data.append(paragraphs) i += 1 ## input_data is a list of dictionary which has a paragraph and questions examples = read_squad_examples(input_data) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) eval_features = convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) ### Loading Pretrained model for QnA config = BertConfig(args.config_file) model = BertForQuestionAnswering(config) model.load_state_dict( torch.load(model_path, map_location=torch.device('cpu'))) model.to(device) pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data pred_sampler = SequentialSampler(pred_data) pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9) predictions = [] for input_ids, input_mask, segment_ids, example_indices in tqdm( pred_dataloader): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) features = [] example = [] all_results = [] for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() feature = eval_features[example_index.item()] unique_id = int(feature.unique_id) features.append(feature) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output = predict(examples, features, all_results, args.max_answer_length) predictions.append(output) ### For printing the results #### index = None for example in examples: if index != example.example_id: print(example.para_text) index = example.example_id print('\n') print( colored('***********Question and Answers *************', 'red')) ques_text = colored(example.question_text, 'blue') print(ques_text) prediction = colored(predictions[math.floor(example.unique_id / 12)][example], 'green', attrs=['reverse', 'blink']) print(prediction) print('\n')