def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps",
                        default=-1.0,
                        type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
        num_choices=4)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Terminate early for benchmarking
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify the learning rate with the special warmup BERT uses;
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load the trained model and config that you have just fine-tuned
        config = BertConfig(output_config_file)
        model = BertForMultipleChoice(config, num_choices=4)
        # noinspection PyUnresolvedReferences
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        # noinspection PyUnboundLocalVariable
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': tr_loss / nb_tr_steps}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
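

# The script above calls several helpers defined elsewhere in the original run_swag-style
# module (read_swag_examples, convert_examples_to_features, select_field, accuracy,
# warmup_linear). A minimal sketch of the three smallest ones is given below, assuming the
# conventional pytorch_pretrained_bert behaviour; treat it as illustrative rather than the
# exact implementation used here.

import numpy as np


def select_field(features, field):
    # Collect one per-choice field (e.g. 'input_ids') into a [num_examples, num_choices, ...] list.
    return [[choice[field] for choice in feature.choices_features] for feature in features]


def accuracy(out, labels):
    # Number of correct argmax predictions in the batch.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def warmup_linear(x, warmup=0.002):
    # Linear warmup followed by linear decay, as in pytorch_pretrained_bert.optimization.
    if x < warmup:
        return x / warmup
    return 1.0 - x
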
def main():
    args = arguments.get_argparse("multiple_choice")
    logger.info(json.dumps(args.__dict__))

    if args.eval_on_train and not args.log_spec:
        args.log_spec = "on_train"

    processors = {
        "race": dataset_processor.RaceProcessor,
        "mctest": dataset_processor.MCTestProcessor,
        "swag": dataset_processor.SwagProcessor,
        "squad": dataset_processor.SquadProcessor,
        "openbookqa": dataset_processor.OpenBookQAProcessor,
        "multirc": dataset_processor.MultiRCProcessor,
        "arc": dataset_processor.ARCProcessor,
        "qa4mre": dataset_processor.QA4MREProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend
        # which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            # (see https://github.com/pytorch/pytorch/pull/13496)
            args.fp16 = False
    logger.info(
        "device %s n_gpu %d distributed training %r",
        device,
        n_gpu,
        bool(args.local_rank != -1),
    )

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_eval` or `do_test` must be True.")
    if (args.do_train or args.do_eval) and args.do_test:
        raise ValueError("Running test must be independent of running train and/or dev")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to "
            "sequence length {}".format(args.max_seq_length, bert_config.max_position_embeddings))

    if args.small_debug:
        args.output_dir = 'debug'
    if os.path.exists(args.output_dir):
        if not os.listdir(args.output_dir) == ["args_log.txt"] and not args.small_debug:
            raise ValueError("Output directory already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.cache_dir, exist_ok=True)

    args_log = os.path.join(args.output_dir, "args_log.txt")
    if not os.path.exists(args_log):
        with open(args_log, "w") as writer:
            writer.write(json.dumps(args.__dict__))
    else:
        print("args_log.txt already exists")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    if "{}" in args.corenlp_cache_dir:
        args.corenlp_cache_dir = args.corenlp_cache_dir.format(task_name)

    processor = processors[task_name](args.data_dir, args.dataset_option)
    num_options = processor.get_num_options()

    if args.convert_from_ans_extr:
        if args.do_train:
            if args.train_predictions:
                processor.set_candidates("train", args.train_predictions)
            else:
                raise ValueError("train prediction file is missing")
        if args.do_eval:
            if args.eval_predictions:
                processor.set_candidates("dev", args.eval_predictions)
            else:
                raise ValueError("eval prediction file is missing")

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None

    def cache_features(examples, split_name):
        cache_spec_cand = [
            task_name,
            args.dataset_option,
            split_name,
            args.input_ablation,
        ]
        cache_spec = "_".join([str(x) for x in cache_spec_cand if x is not None])
        cache_path = os.path.join(args.cache_dir, "{}.pkl".format(cache_spec))

        if os.path.exists(cache_path) and not args.no_cache:
            features = pickle.load(open(cache_path, "rb"))
        else:
            if args.input_ablation or args.output_statistics:
                corenlp_cache_path = os.path.join(
                    args.corenlp_cache_dir, "{}_{}.pkl".format(task_name, split_name))
                corenlp_cache = pickle.load(open(corenlp_cache_path, "rb"))
            else:
                corenlp_cache = None

            if args.output_statistics:
                output_mcmrc.output_statistics(examples, corenlp_cache)

            tokenized_examples = generate_tokenized_examples(
                examples, tokenizer, args.input_ablation, corenlp_cache, args.entity_anonymization)

            if args.output_mturk or args.output_examples:
                for ex in examples:
                    ex.input_ablation = "original"
                original_examples = generate_tokenized_examples(examples, tokenizer, None, None)
                if args.output_examples:
                    output_mcmrc.output_examples(
                        tokenized_examples,
                        original_examples,
                        task_name,
                        'ent_anon' if args.entity_anonymization else args.input_ablation,
                    )
                if args.output_mturk:
                    output_mcmrc.output_mturk(
                        tokenized_examples, original_examples, task_name, args.input_ablation)
                exit(1)

            features = convert_examples_to_features(
                tokenized_examples,
                num_options,
                args.max_seq_length,
                args.max_query_length,
                args.max_option_length,
                tokenizer,
            )
            if not args.no_cache:
                with open(cache_path, "wb") as f:
                    pickle.dump(features, f)

        # assert len(examples) == len(features)
        return features

    if args.do_train:
        train_examples = processor.get_train_examples()
        if args.small_debug:
            train_examples = train_examples[:6000]
        num_train_per_epoch = len(train_examples)
        num_train_per_epoch /= args.train_batch_size
        num_train_per_epoch /= args.gradient_accumulation_steps
        num_train_steps = int(num_train_per_epoch * args.num_train_epochs)
        train_features = cache_features(train_examples, "train")

    if args.do_eval:
        if args.eval_on_train:
            eval_examples = processor.get_train_examples()
            eval_features = cache_features(eval_examples, "train")
        else:
            eval_examples = processor.get_dev_examples()
            if args.small_debug:
                eval_examples = eval_examples[:1000]
            eval_features = cache_features(eval_examples, "dev")

    if args.do_test:
        eval_examples = processor.get_test_examples()
        eval_features = cache_features(eval_examples, "test")

    global entity_set
    if args.entity_anonymization:
        if len(entity_set) == 0:
            anon_tag_cache_file = os.path.join(
                args.cache_dir, f'{task_name}_anon_tags_{args.entity_anonymization}.pkl')
            if not os.path.exists(anon_tag_cache_file):
                raise ValueError("vocabulary cache cannot be loaded")
            entity_set = pickle.load(open(anon_tag_cache_file, 'rb'))
            tokenizer.vocab_update(sorted(entity_set))
        else:
            anon_tag_cache_file = os.path.join(
                args.cache_dir, f'{task_name}_anon_tags_{args.entity_anonymization}.pkl')
            if not os.path.exists(anon_tag_cache_file):
                with open(anon_tag_cache_file, 'wb') as f:
                    pickle.dump(entity_set, f)

    # Prepare model
    model = BertForMultipleChoice(bert_config, num_options)
    if args.init_checkpoint is not None:
        state_dict = torch.load(args.init_checkpoint, map_location="cpu")
        if list(state_dict)[0].startswith("bert."):
            # finetuned on some target dataset
            model.load_state_dict(state_dict)
        else:
            # pretrained language model
            model.bert.load_state_dict(state_dict)

    if args.entity_anonymization and len(entity_set):
        model.bert.embeddings.extend_word_embeddings(len(entity_set))

    if args.limit_vocab_size or args.limit_vocab_freq:
        use_vocab, train_features, eval_features = vocab_selection(
            train_features,
            eval_features,
            args.cache_dir,
            args.output_dir,
            task_name,
            tokenizer,
            args.entity_anonymization,
            args.limit_vocab_size,
            args.limit_vocab_freq,
            num_options=num_options,
        )
        id_to_token = {v: k for k, v in tokenizer.vocab.items()}
        use_tokens = [id_to_token[i] for i in use_vocab]
        logger.info(sorted(use_tokens))
        logger.info(f'{len(use_tokens)}')
        model.bert.embeddings.limit_vocab(use_vocab)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [
            (n, param.clone().detach().to("cpu").float().requires_grad_())
            for n, param in model.named_parameters()
        ]
    elif args.optimize_on_cpu:
        param_optimizer = [
            (n, param.clone().detach().to("cpu").requires_grad_())
            for n, param in model.named_parameters()
        ]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if n not in no_decay],
            "weight_decay_rate": 0.01,
        },
        {
            "params": [p for n, p in param_optimizer if n in no_decay],
            "weight_decay_rate": 0.0,
        },
    ]
    optimizer = BERTAdam(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        warmup=args.warmup_proportion,
        t_total=num_train_steps,
    )

    global_step = 0

    if args.enter_debugger:
        model.eval()
        # features = convert_examples_to_features(
        #     eval_examples,
        #     num_options,
        #     args.max_seq_length,
        #     args.max_query_length,
        #     args.max_option_length,
        #     tokenizer,
        # )
        # output = get_predictions(
        #     model, eval_examples, features, args, device
        # )
        # output_logits, output_predictions, eval_loss, eval_accuracy = output
        print("in debugger")
        import pdb
        pdb.set_trace()

    def eval_func(num_epoch=-1, num_step=-1, log_spec=None):
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        model.eval()
        output = get_predictions(model, eval_examples, eval_features, args, device)
        output_logits, output_predictions, eval_loss, eval_accuracy = output
        model.train()
        output_qids = [e.qid for e in eval_examples]
        output_answers = [e.ans_idx for e in eval_examples]
        result = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
            "global_step": global_step,
        }
        output_spec = ""
        if num_epoch > -1 and num_step > -1:
            output_spec = "_{}_{}".format(num_epoch, num_step)
        elif log_spec:
            output_spec += "_{}".format(log_spec)

        output_eval_file = os.path.join(args.output_dir, "eval_results{}.json".format(output_spec))
        result["spec"] = output_spec
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        with open(output_eval_file, "w") as writer:
            writer.write(json.dumps(result))

        output_pred_file = os.path.join(args.output_dir, "eval_preds{}.jsonl".format(output_spec))
        with open(output_pred_file, "w") as f:
            for qid, ans, pred, logit in zip(output_qids, output_answers,
                                             output_predictions, output_logits):
                result = {
                    "qid": qid,
                    "answer": chr(ans + ord("A")),
                    "prediction": chr(pred + ord("A")),
                    "logits": logit.tolist(),
                }
                f.write(json.dumps(result) + "\n")

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for i in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            tmp_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data /= args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            args.loss_scale /= 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

                    if global_step % args.save_model_steps == 0:
                        output_model_file = os.path.join(
                            args.output_dir,
                            "pytorch_model_step_{}.bin".format(global_step),
                        )
                        if n_gpu > 1:
                            torch.save(model.module.state_dict(), output_model_file)
                        else:
                            torch.save(model.state_dict(), output_model_file)

                tmp_loss += loss.item()
                if args.loss_report_steps > 0 and global_step % args.loss_report_steps == 0:
                    logger.info("Step loss: {}".format(tmp_loss / args.loss_report_steps))
                    tmp_loss = 0

                if (args.eval_steps > 0 and global_step > 0
                        and global_step % args.eval_steps == 0 and args.do_eval):
                    eval_func(i, global_step, args.log_spec)

            output_model_file = os.path.join(args.output_dir, "pytorch_model_epoch_{}.bin".format(i))
            if n_gpu > 1:
                torch.save(model.module.state_dict(), output_model_file)
            else:
                torch.save(model.state_dict(), output_model_file)

            if args.do_eval:
                eval_func(i, global_step, args.log_spec)

    if not args.do_train and args.do_eval:
        eval_func(log_spec=args.log_spec or "dev")
    if args.do_test:
        eval_func(log_spec=args.log_spec or "test")
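

# The fp16/optimize_on_cpu branch above relies on two helpers, set_optimizer_params_grad and
# copy_optimizer_params_to_model, that keep a master (fp32/CPU) copy of the parameters inside
# the optimizer while the model holds the working copy. A minimal sketch, assuming the
# conventions of the early BERT PyTorch port; not necessarily the exact implementation used here.

import torch


def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False):
    # Copy gradients from the model parameters onto the optimizer's master copy.
    # Returns True if any gradient contains NaN (used above to trigger loss-scale backoff).
    is_nan = False
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer,
                                                                  named_params_model):
        if name_opti != name_model:
            raise ValueError("Parameter mismatch: {} vs {}".format(name_opti, name_model))
        if param_model.grad is not None:
            if test_nan and torch.isnan(param_model.grad).any():
                is_nan = True
            if param_opti.grad is None:
                param_opti.grad = torch.zeros_like(param_opti.data)
            param_opti.grad.data.copy_(param_model.grad.data)
        else:
            param_opti.grad = None
    return is_nan


def copy_optimizer_params_to_model(named_params_model, named_params_optimizer):
    # Copy the updated master parameters back into the model, casting to the model's dtype.
    for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer,
                                                                  named_params_model):
        if name_opti != name_model:
            raise ValueError("Parameter mismatch: {} vs {}".format(name_opti, name_model))
        param_model.data.copy_(param_opti.data.to(param_model.dtype))
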
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "commonsenseqa": CommonsenseQaProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "commonsenseqa": 4,
    }

    # if args.local_rank == -1 or args.no_cuda:
    #     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    #     n_gpu = torch.cuda.device_count()
    # else:
    #     torch.cuda.set_device(args.local_rank)
    #     device = torch.device("cuda", args.local_rank)
    #     n_gpu = 1
    #     # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    #     torch.distributed.init_process_group(backend='nccl')
    device = "cuda:2"
    n_gpu = 1
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    print("current task is " + str(task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    # model = BertForSequenceClassification.from_pretrained(
    #     args.bert_model,
    #     cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
    #     num_labels=num_labels)
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        num_choices=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    best_eval_accuracy = 0.0
    if args.do_train:
        train_features = convert_examples_to_features_mc(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify the learning rate with the special warmup BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                eval_examples = processor.get_dev_examples(args.data_dir)
                eval_features = convert_examples_to_features_mc(
                    eval_examples, label_list, args.max_seq_length, tokenizer)
                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", args.eval_batch_size)
                all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
                eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0

                for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
                        # logits = model(input_ids, segment_ids, input_mask)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    tmp_eval_accuracy = accuracy(logits, label_ids)

                    eval_loss += tmp_eval_loss.mean().item()
                    eval_accuracy += tmp_eval_accuracy

                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                eval_accuracy = eval_accuracy / nb_eval_examples
                print("the current eval accuracy is: " + str(eval_accuracy))
                if eval_accuracy > best_eval_accuracy:
                    best_eval_accuracy = eval_accuracy
                    if args.do_train:
                        torch.save(model_to_save.state_dict(), output_model_file)
                model.train()

    # # Save a trained model
    # model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    # if args.do_train:
    #     torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                  state_dict=model_state_dict,
                                                  num_choices=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features_mc(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        all_pred_labels = []
        all_anno_labels = []
        all_logits = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
                # logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            output_labels = np.argmax(logits, axis=1)
            all_pred_labels.extend(output_labels.tolist())
            all_logits.extend(list(logits))
            all_anno_labels.extend(list(label_ids))
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'best_eval_accuracy': best_eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            for i in range(len(all_pred_labels)):
                writer.write(str(i) + "\t" + str(all_anno_labels[i]) + "\t" +
                             str(all_pred_labels[i]) + "\t" + str(all_logits[i]) + "\n")
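

# A typical invocation of the script above (the filename and paths are placeholders; every
# flag shown is defined in the argparse block at the top of this main):
#
#   python run_multiple_choice.py \
#       --task_name commonsenseqa \
#       --data_dir ./data/commonsenseqa \
#       --bert_model bert-base-uncased \
#       --output_dir ./output/commonsenseqa \
#       --do_train --do_eval --do_lower_case \
#       --train_batch_size 32 --learning_rate 5e-5 --num_train_epochs 3
#
# Note that the device is currently hard-coded to "cuda:2"; the commented-out block inside
# main shows the original local_rank/no_cuda device selection it replaced.
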
def main(start_index=0, end_index=0):
    num_train_epochs = 3
    learning_rate = 3e-5
    max_seq_length = 58
    train_batch_size = 40
    warmup_proportion = 0.1
    seed = 1979
    gradient_accumulation_steps = 1
    margin = 0.37
    l2_reg = 0.02
    do_margin_loss = 1

    train_batch_size = int(train_batch_size / gradient_accumulation_steps)
    eval_batch_size = train_batch_size

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    train_examples = load_copa_data('./data/COPA/train.jsonl')
    num_train_steps = int(
        len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)
    eval_examples = load_copa_data('./data/COPA/val.jsonl')
    test_examples = load_copa_data_from_csv('./data/COPA/test.csv')

    # Prepare model
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    if do_margin_loss:
        model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_choices=2, margin=margin)
    else:
        model = BertForMultipleChoice.from_pretrained("bert-base-uncased", num_choices=2)
    model.cuda()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': l2_reg},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total,
                         b2=0.98)

    # Prep Eval Data
    eval_features = convert_examples_to_features(eval_examples, tokenizer, max_seq_length, True)
    eval_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
    eval_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
    eval_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
    eval_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_segment_ids, eval_label)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    # Prep Test Data
    test_features = convert_examples_to_features(test_examples, tokenizer, max_seq_length, True)
    test_input_ids = torch.tensor(select_field(test_features, 'input_ids'), dtype=torch.long)
    test_input_mask = torch.tensor(select_field(test_features, 'input_mask'), dtype=torch.long)
    test_segment_ids = torch.tensor(select_field(test_features, 'segment_ids'), dtype=torch.long)
    test_label = torch.tensor([f.label for f in test_features], dtype=torch.long)
    test_data = TensorDataset(test_input_ids, test_input_mask, test_segment_ids, test_label)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=eval_batch_size)

    # Prep Training Data
    train_features = convert_examples_to_features(train_examples, tokenizer, max_seq_length, True)
    train_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
    train_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
    train_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
    train_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_label)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

    print("***** Running training *****")
    print("  Num examples = %d" % len(train_examples))
    print("  Batch size = %d" % train_batch_size)
    print("  Num steps = %d" % num_train_steps)

    global_step = 0
    eval_acc_list = []
    for epoch in range(num_train_epochs):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch: {epoch}')):
            model.train()
            batch = tuple(t.cuda() for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1

        logits_all, eval_accuracy = do_evaluation(model, eval_dataloader, is_training=False)
        tqdm.write(f'\nEvaluation Accuracy: {eval_accuracy}\n')
        eval_acc_list.append(eval_accuracy)

    logits_all, best_test_acc = do_evaluation(model, test_dataloader, is_training=False)
    print(f'Testing Accuracy: {best_test_acc}')
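

# The COPA script above hard-codes its hyperparameters, so a bare entry point is enough to run
# it end to end. The guard below is an assumption: the original file may define its own, for
# example one that makes use of the currently unused start_index/end_index parameters.
if __name__ == '__main__':
    main()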