def _simpQA(data_list, args, rc_args, tokenizer, model, device):
    # Dump the examples to a temporary file so the standard dataloader can consume them.
    tempfile = f'{args.data_dir}/temp_{random.randint(0, 1000)}.json'
    write_file({'data': data_list}, tempfile)
    eval_dataloader, eval_examples, eval_features = get_dataloader(
        logger, args=rc_args, input_file=tempfile,
        batch_size=args.predict_batch_size, tokenizer=tokenizer)
    os.remove(tempfile)
    preds, nbest = predict(rc_args, model, eval_dataloader, eval_examples,
                           eval_features, device)
    # Keep the top answer per question id, plus a text -> logit map over the n-best list
    # (a small usage sketch of the returned structures follows this function).
    id_preds, id_text_n_logit = {}, {}
    for key, val in preds.items():
        id_preds[key] = val[0]
    for key, val in nbest.items():
        id_text_n_logit[key] = {x['text']: x['logit'] for x in val}
    return id_preds, id_text_n_logit
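# Illustrative sketch (not part of the original code): the shape of the two dicts returned
# by _simpQA, assuming `predict` yields an n-best list of {'text': ..., 'logit': ...} entries
# per question id, as the comprehensions above imply. The data below is made up.
example_id_preds = {'q1': 'the Eiffel Tower'}                              # top-1 text per id
example_id_text_n_logit = {'q1': {'the Eiffel Tower': 2.3, 'Paris': 1.1}}  # text -> logit per id

# Picking the highest-logit candidate per id recovers the top-1 predictions.
rescored = {qid: max(cands, key=cands.get)
            for qid, cands in example_id_text_n_logit.items()}
assert rescored == example_id_preds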
def main():
    parser = argparse.ArgumentParser()

    BERT_DIR = "./model/uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR + "bert_config.json", type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR + "vocab.txt", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--train_file", type=str, default="",
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", type=str, default="",
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", type=str, default=BERT_DIR + "pytorch_model.bin",
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=300, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=128, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed "
                             "because the start and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size "
                             "and accumulate)")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument('--eval_period', type=int, default=2000)
    parser.add_argument('--max_n_answers', type=int, default=5)
    parser.add_argument('--merge_query', type=int, default=-1)
    parser.add_argument('--reduce_layers', type=int, default=-1)
    parser.add_argument('--reduce_layers_to_tune', type=int, default=-1)
    parser.add_argument('--only_comp', action="store_true", default=False)
    parser.add_argument('--train_subqueries_file', type=str, default="")
    parser.add_argument('--predict_subqueries_file', type=str, default="")
    parser.add_argument('--prefix', type=str, default="")
    parser.add_argument('--model', type=str, default="qa")
    parser.add_argument('--pooling', type=str, default="max")
    parser.add_argument('--debug', action="store_true", default=False)
    parser.add_argument('--output_dropout_prob', type=float, default=0)
    parser.add_argument('--wait_step', type=int, default=30)
    parser.add_argument('--with_key', action="store_true", default=False)
    parser.add_argument('--add_noise', action="store_true", default=False)
    args = parser.parse_args()

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError("If `do_train` is True, then `predict_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError("Cannot use sequence length %d because the BERT model "
                         "was only trained up to sequence length %d" %
                         (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        logger.info("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger, args=args,
        input_file=args.predict_file,
        subqueries_file=args.predict_subqueries_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)

    if args.do_train:
        train_dataloader, train_examples, _, num_train_steps = get_dataloader(
            logger=logger, args=args,
            input_file=args.train_file,
            subqueries_file=args.train_subqueries_file,
            is_training=True,
            batch_size=args.train_batch_size,
            num_epochs=args.num_train_epochs,
            tokenizer=tokenizer)

    if args.model == 'qa':
        model = BertForQuestionAnswering(bert_config, 4)
        metric_name = "F1"
    elif args.model == 'classifier':
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        model = BertClassifier(bert_config, 2, args.pooling)
        metric_name = "F1"
    elif args.model == "span-predictor":
        if args.reduce_layers != -1:
            bert_config.num_hidden_layers = args.reduce_layers
        Model = BertForQuestionAnsweringWithKeyword if args.with_key else BertForQuestionAnswering
        model = Model(bert_config, 2)
        metric_name = "Accuracy"
    else:
        raise NotImplementedError()

    if args.init_checkpoint is not None and args.do_predict and \
            len(args.init_checkpoint.split(',')) > 1:
        # Ensemble prediction: load one QA model per comma-separated checkpoint.
        assert args.model == "qa"
        model = [model]
        for i, checkpoint in enumerate(args.init_checkpoint.split(',')):
            if i > 0:
                model.append(BertForQuestionAnswering(bert_config, 4))
            print("Loading from", checkpoint)
            state_dict = torch.load(checkpoint, map_location='cpu')
            # Strip the 'module.' prefix left by DataParallel checkpoints
            # (a standalone sketch of this pattern follows this function).
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model[-1].load_state_dict(state_dict)
            model[-1].to(device)
    else:
        if args.init_checkpoint is not None:
            print("Loading from", args.init_checkpoint)
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
            if args.reduce_layers != -1:
                # Drop the weights of the encoder layers that were removed from the config.
                state_dict = {k: v for k, v in state_dict.items()
                              if '.'.join(k.split('.')[:3]) not in
                              ['encoder.layer.{}'.format(i) for i in range(args.reduce_layers, 12)]}
            if args.do_predict:
                filter = lambda x: x[7:] if x.startswith('module.') else x
                state_dict = {filter(k): v for (k, v) in state_dict.items()}
                model.load_state_dict(state_dict)
            else:
                model.bert.load_state_dict(state_dict)
            if args.reduce_layers_to_tune != -1:
                # Freeze the embeddings and all but the last `reduce_layers_to_tune` encoder layers.
                for param in model.bert.embeddings.parameters():
                    param.requires_grad = False
                n_layers = 12 if args.reduce_layers == -1 else args.reduce_layers
                for i in range(n_layers - args.reduce_layers_to_tune):
                    for param in model.bert.encoder.layer[i].parameters():
                        param.requires_grad = False
        model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Apply weight decay to everything except biases and LayerNorm (gamma/beta) parameters.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = 0
        wait_step = 0
        stop_training = False
        model.train()

        # Training loop with gradient accumulation, periodic evaluation, and early stopping
        # (see the gradient-accumulation sketch after this function).
        for epoch in range(int(args.num_train_epochs)):
            for step, batch in tqdm(enumerate(train_dataloader)):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients.
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(args, model, eval_dataloader, eval_examples, eval_features,
                                 device, write_prediction=False)
                    logger.info("%s: %.3f on epoch=%d" % (metric_name, f1 * 100.0, epoch))
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.3f -> %.3f on epoch=%d" %
                                    (metric_name, best_f1 * 100.0, f1 * 100.0, epoch))
                        model_state_dict = {k: v.cpu() for (k, v) in model.state_dict().items()}
                        torch.save(model_state_dict, os.path.join(args.output_dir, "best-model.pt"))
                        model = model.cuda()
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if best_f1 > 0.1 and wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break

    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(args, model, eval_dataloader, eval_examples, eval_features, device)
        logger.info("Final %s score: %.3f%%" % (metric_name, f1 * 100.0))
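# Standalone sketch (assumed, not from the original files) of the checkpoint-key handling in
# main() above: a checkpoint saved from a torch.nn.DataParallel-wrapped model prefixes every
# key with 'module.', which must be stripped before loading into an unwrapped model. The
# Linear module below is a hypothetical stand-in for BertForQuestionAnswering.
import torch

net = torch.nn.Linear(4, 2)
# Simulate a checkpoint written from a DataParallel-wrapped copy of the same model.
ckpt = {'module.' + k: v for k, v in net.state_dict().items()}

strip = lambda k: k[len('module.'):] if k.startswith('module.') else k
net.load_state_dict({strip(k): v for k, v in ckpt.items()})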
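# Minimal sketch (assumed, not from the original files) of the gradient-accumulation pattern
# in the training loop above, using a toy model and synthetic batches in place of BERTAdam
# and model(batch, global_step): the loss is scaled by the number of accumulation steps and
# the optimizer only steps once every `accumulation_steps` batches.
import torch

toy_model = torch.nn.Linear(4, 1)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
accumulation_steps = 4
global_step = 0
for batch in torch.randn(8, 3, 4):            # 8 toy batches of 3 examples each
    global_step += 1
    loss = toy_model(batch).pow(2).mean()
    loss = loss / accumulation_steps          # keep the accumulated gradient an average
    loss.backward()
    if global_step % accumulation_steps == 0:
        toy_optimizer.step()                  # update once enough gradients are accumulated
        toy_model.zero_grad()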
def main():
    parser = argparse.ArgumentParser()

    BERT_DIR = "uncased_L-12_H-768_A-12/"
    ## Required parameters
    parser.add_argument("--bert_config_file", default=BERT_DIR + "bert_config.json", type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=BERT_DIR + "vocab.txt", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default="out", type=str,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--load", default=False, action='store_true')
    parser.add_argument("--train_file", type=str, default="/home/sewon/data/squad/train-v1.1.json",
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", type=str, default="/home/sewon/data/squad/dev-v1.1.json",
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--init_checkpoint", type=str, default=BERT_DIR + "pytorch_model.bin",
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=300, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than "
                             "this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=39, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=300, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=1000.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument("--n_best_size", default=3, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--accumulate_gradients", type=int, default=1,
                        help="Number of steps to accumulate gradient on (divide the batch_size "
                             "and accumulate)")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument('--eval_period', type=int, default=500)
    parser.add_argument('--max_n_answers', type=int, default=20)
    parser.add_argument('--n_paragraphs', type=str, default='40')
    parser.add_argument('--verbose', action="store_true", default=False)
    parser.add_argument('--wait_step', type=int, default=12)
    # Learning method variation
    parser.add_argument('--loss_type', type=str, default="mml")
    parser.add_argument('--tau', type=float, default=12000.0)
    # For evaluation
    parser.add_argument('--prefix', type=str, default="")
    parser.add_argument('--debug', action="store_true", default=False)
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)

    # Log both to a file in the output directory and to stdout.
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO,
                        handlers=[logging.FileHandler(os.path.join(args.output_dir, "log.txt")),
                                  logging.StreamHandler()])
    logger = logging.getLogger(__name__)
    logger.info(args)

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
            args.accumulate_gradients))
    args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
        if not args.predict_file:
            raise ValueError("If `do_train` is True, then `predict_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.do_train and args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError("Cannot use sequence length %d because the BERT model "
                         "was only trained up to sequence length %d" %
                         (args.max_seq_length, bert_config.max_position_embeddings))

    model = BertForQuestionAnswering(bert_config, device, 4,
                                     loss_type=args.loss_type, tau=args.tau)
    metric_name = "EM"

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    # When --train_file is a comma-separated list, cycle through one shard per epoch
    # (see the schedule sketch after this function).
    train_split = ',' in args.train_file
    if train_split:
        n_train_files = len(args.train_file.split(','))

    eval_dataloader, eval_examples, eval_features, _ = get_dataloader(
        logger=logger, args=args,
        input_file=args.predict_file,
        is_training=False,
        batch_size=args.predict_batch_size,
        num_epochs=1,
        tokenizer=tokenizer)

    if args.do_train:
        train_file = args.train_file
        if train_split:
            train_file = args.train_file.split(',')[0]
        train_dataloader, _, _, num_train_steps = get_dataloader(
            logger=logger, args=args,
            input_file=train_file,
            is_training=True,
            batch_size=args.train_batch_size,
            num_epochs=args.num_train_epochs,
            tokenizer=tokenizer)

    if args.init_checkpoint is not None:
        logger.info("Loading from {}".format(args.init_checkpoint))
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        if args.do_train and args.init_checkpoint.endswith('pytorch_model.bin'):
            # Pre-trained BERT weights: load into the encoder only.
            model.bert.load_state_dict(state_dict)
        else:
            # Fine-tuned checkpoint: strip the 'module.' prefix left by DataParallel,
            # then load into the full model.
            filter = lambda x: x[7:] if x.startswith('module.') else x
            state_dict = {filter(k): v for (k, v) in state_dict.items()}
            model.load_state_dict(state_dict)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Apply weight decay to everything except biases and LayerNorm (gamma/beta) parameters.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)

        global_step = 0
        best_f1 = (-1, -1)
        wait_step = 0
        stop_training = False
        train_losses = []
        model.train()

        for epoch in range(int(args.num_train_epochs)):
            if epoch > 0 and train_split:
                # Round-robin over the comma-separated training shards.
                train_file = args.train_file.split(',')[epoch % n_train_files]
                train_dataloader = get_dataloader(
                    logger=logger, args=args,
                    input_file=train_file,
                    is_training=True,
                    batch_size=args.train_batch_size,
                    num_epochs=args.num_train_epochs,
                    tokenizer=tokenizer)[0]
            for step, batch in enumerate(train_dataloader):
                global_step += 1
                batch = [t.to(device) for t in batch]
                loss = model(batch, global_step)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                train_losses.append(loss.detach().cpu())
                loss.backward()
                if global_step % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients.
                    model.zero_grad()
                if global_step % args.eval_period == 0:
                    model.eval()
                    f1 = predict(logger, args, model, eval_dataloader, eval_examples,
                                 eval_features, device, write_prediction=False)
                    logger.info("Step %d Train loss %.2f EM %.2f F1 %.2f on epoch=%d" %
                                (global_step, np.mean(train_losses), f1[0] * 100, f1[1] * 100, epoch))
                    train_losses = []
                    if best_f1 < f1:
                        logger.info("Saving model with best %s: %.2f (F1 %.2f) -> %.2f (F1 %.2f) on epoch=%d" %
                                    (metric_name, best_f1[0] * 100, best_f1[1] * 100,
                                     f1[0] * 100, f1[1] * 100, epoch))
                        model_state_dict = {k: v.cpu() for (k, v) in model.state_dict().items()}
                        torch.save(model_state_dict, os.path.join(args.output_dir, "best-model.pt"))
                        model = model.to(device)
                        best_f1 = f1
                        wait_step = 0
                        stop_training = False
                    else:
                        wait_step += 1
                        if wait_step == args.wait_step:
                            stop_training = True
                    model.train()
            if stop_training:
                break
        logger.info("Training finished!")

    elif args.do_predict:
        if type(model) == list:
            model = [m.eval() for m in model]
        else:
            model.eval()
        f1 = predict(logger, args, model, eval_dataloader, eval_examples, eval_features,
                     device, varying_n_paragraphs=len(args.n_paragraphs) > 1)
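# Sketch (assumed, not from the original files) of the round-robin training-file schedule used
# above when --train_file is a comma-separated list: epoch 0 trains on the first shard and
# later epochs cycle through the rest via `epoch % n_train_files`. File names are hypothetical.
train_file_arg = "shard0.json,shard1.json,shard2.json"
shards = train_file_arg.split(',')
schedule = [shards[epoch % len(shards)] for epoch in range(5)]
assert schedule == ['shard0.json', 'shard1.json', 'shard2.json', 'shard0.json', 'shard1.json']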