def load_mc_model(self, model_dir='models/lp/', do_lower_case=True):
    num_choices = 5
    config_file = os.path.join(model_dir, 'bert_mc_config.json')
    model_file = os.path.join(model_dir, 'bert_mc_model.bin')
    config = BertConfig(config_file)
    model = BertForMultipleChoice(config, num_choices=num_choices)
    model.load_state_dict(
        torch.load(model_file,
                   map_location=torch.device(
                       "cuda" if torch.cuda.is_available() else "cpu")))
    tokenizer = BertTokenizer.from_pretrained(model_dir,
                                              do_lower_case=do_lower_case)
    return model, tokenizer
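The loader above returns a (model, tokenizer) pair ready for inference. A minimal, hedged call-site sketch follows; the `MCModelWrapper` owner class and the toy input tensors are illustrative assumptions rather than part of the original code, and the inputs follow the (batch_size, num_choices, seq_len) layout that BertForMultipleChoice expects.

import torch

# Hypothetical owner of load_mc_model; any object exposing the method works.
loader = MCModelWrapper()
model, tokenizer = loader.load_mc_model(model_dir='models/lp/', do_lower_case=True)
model.eval()

# Toy batch: 1 example, 5 answer choices, 8 tokens per choice.
input_ids = torch.zeros(1, 5, 8, dtype=torch.long)
segment_ids = torch.zeros(1, 5, 8, dtype=torch.long)
input_mask = torch.ones(1, 5, 8, dtype=torch.long)

with torch.no_grad():
    logits = model(input_ids, segment_ids, input_mask)  # shape: (1, 5)
prediction = logits.argmax(dim=1)  # index of the predicted choice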
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel. Note that the multiple choice head is
    only initialized and has to be trained.

    Args:
        num_choices: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        >>> model.eval()
        # Predict the multiple choice logits
        >>> with torch.no_grad():
                multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        >>> labels = torch.tensor([1])
        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels)  # set model.train() before if training this loss
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel.
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
def main():
    data_dir = 'RACE'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = "cpu"
    bert_model = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
    model = BertForMultipleChoice.from_pretrained(
        bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'model',
        num_choices=4)
    model.to(device)

    PATH = os.path.join('large_models', 'pytorch_model_2epochs.bin')
    model.load_state_dict(torch.load(PATH))
    model.to(device)

    max_seq_length = 200
    output_dir = 'large_models'
    dev_data = process_race(data_dir + '/', 'test')
    dev_features = convert_to_bert_style(dev_data, tokenizer, max_seq_length)
    dev_data = prepare_data(dev_features)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=1)

    model.eval()
    dev_loss, dev_accuracy = 0, 0
    num_eval_examples, num_eval_steps = 0, 0
    for step, batch in enumerate(dev_dataloader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)
        print(100 * step / len(dev_dataloader))
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        dev_loss += tmp_eval_loss.mean().item()
        dev_accuracy += tmp_eval_accuracy
        num_eval_examples += input_ids.size(0)
        num_eval_steps += 1
        del batch

    dev_loss = dev_loss / num_eval_steps
    dev_accuracy = dev_accuracy / num_eval_examples
    output_eval_file = os.path.join(output_dir, "dev_results.txt")
    result = {'dev_loss': dev_loss, 'dev_accuracy': dev_accuracy}
    with open(output_eval_file, "a+") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel.

    Args:
        num_choices: the number (>=2) of classes for the classifier.

    Example:
        >>> torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2, force_reload=True)
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model
def test_BertForMultipleChoice():
    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]],
                                  [[12, 16, 42], [14, 28, 57]]])
    input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],
                                   [[1, 1, 0], [1, 0, 0]]])
    token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],
                                       [[0, 1, 1], [0, 0, 1]]])
    config = BertConfig(vocab_size_or_config_json_file=32000,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    num_choices = 2
    model = BertForMultipleChoice(config, num_choices)
    print(model(input_ids, token_type_ids, input_mask))
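If useful, the toy test above can be tightened into a shape assertion. The sketch below assumes the pytorch-pretrained-bert behaviour that BertForMultipleChoice called without labels returns classification scores of shape (batch_size, num_choices); it is illustrative, not part of the original test.

def test_BertForMultipleChoice_shapes():
    # Same toy batch as above: 2 examples, 2 choices, 3 tokens per choice.
    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]],
                                  [[12, 16, 42], [14, 28, 57]]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                        num_hidden_layers=12, num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForMultipleChoice(config, num_choices=2)
    with torch.no_grad():
        logits = model(input_ids)  # token_type_ids / attention_mask default to None
    assert logits.shape == (2, 2)  # one score per choice for each example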
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files for the task.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints are written.")
    args = parser.parse_args()

    # TODO: Fix hardcoded values
    max_seq_length = 512
    bert_model = 'bert-base-cased'
    do_lower_case = False
    n_classes = 3
    test_file = 'gap-development.tsv'
    batch_size = 1

    # Load a trained model and config that you have fine-tuned
    config = BertConfig(os.path.join(args.model_dir, CONFIG_NAME))
    model = BertForMultipleChoice(config, num_choices=3)
    model.load_state_dict(torch.load(os.path.join(args.model_dir, WEIGHTS_NAME)))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

    # Prepare test data
    examples = read_GAP_examples(os.path.join(args.data_dir, test_file),
                                 n_classes=n_classes, is_training=False)
    features = convert_examples_to_features(examples, tokenizer, max_seq_length, True)
    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    dataloader = DataLoader(data, sampler=SequentialSampler(data),
                            batch_size=batch_size, shuffle=False)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default="data/", type=str,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="output/", type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--load_model", default=None, required=True, type=str,
                        help="Path of the fine-tuned model checkpoint to load.")
    parser.add_argument("--output_name", default="result.csv", type=str, required=True,
                        help="Name of the .csv file the predictions will be written to.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    ## Load a trained model that you have fine-tuned
    # (the checkpoint is loaded before the model is wrapped for DDP/DataParallel)
    output_model_file = os.path.join(args.load_model)
    model_state_dict = torch.load(output_model_file)
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                  state_dict=model_state_dict,
                                                  num_choices=4)
    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        test_dir = os.path.join(args.data_dir, 'TestingData.csv')
        # test_dir = os.path.join(args.data_dir, 'dev_merge.csv')
        ## test high
        eval_examples = read_race_examples(test_dir)
        eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                     args.max_seq_length, False)
        logger.info("***** Running evaluation: test high *****")
        # logger.info("Epoch: {}".format(ep+1))
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_example_ids = exp_id(eval_features)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        # all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        high_eval_loss, high_eval_accuracy = 0, 0
        high_nb_eval_steps, high_nb_eval_examples = 0, 0
        predict = []
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids = batch
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            anws = np.argmax(logits, axis=1)
            if predict == []:
                predict = anws
            else:
                predict = np.concatenate((predict, anws), axis=0)
            print(predict)
            high_nb_eval_examples += input_ids.size(0)
            high_nb_eval_steps += 1

        # output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        output_eval_file = os.path.join(args.output_dir, args.output_name)
        predict = predict + 1
        predict = predict.tolist()
        id_li = [str(i) for i in range(1, len(predict) + 1)]
        df2 = pd.DataFrame({'ID': all_example_ids, 'Ans': predict})
        df2.to_csv(output_eval_file, index=False, sep=',')
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") #print(args.output_dir) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_race(os.path.join(args.data_dir,"train")) print(len(train_examples), args.train_batch_size,args.gradient_accumulation_steps) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE, num_choices=4) if args.fp16: model.half() model.to(device) #Freeze the embedding network and encoder layers print("Freeze network") for name, param in model.named_parameters(): ln = 24 if name.startswith('bert.encoder'): l = name.split('.') ln = int(l[3]) if name.startswith('bert.embeddings') or ln < 6: print(name) param.requires_grad = False # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] global_step = 0 if args.do_train: if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = 
torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() output_train_file = os.path.join(args.output_dir, "train_results.txt") loss_writer = open(output_train_file, "w",1) for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 last_tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) # if n_gpu > 1: # loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if nb_tr_examples % 512 == 0: loss_log = (tr_loss - last_tr_loss)*1.0/512 print(nb_tr_examples,loss_log) loss_writer.write("%d %f \n" % (nb_tr_examples,loss_log)) last_tr_loss = tr_loss if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval: eval_examples = read_race(os.path.join(args.data_dir,"dev")) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = 
torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() tr_loss = 0 eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 total_logits = [] total_labels = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 total_logits.append(logits) total_labels.append(label_ids) total_logits = np.concatenate(total_logits) total_labels = np.concatenate(total_labels) np.save(args.output_dir+"/logits.npy",total_logits) np.save(args.output_dir+"/labels.npy",total_labels) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocab file.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--has_ans", default=True, action='store_true',
                        help="Whether has answer in the eval dataset")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(0)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} ({}) n_gpu: {}".format(
        device, torch.cuda.get_device_name(0), n_gpu))

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file, do_lower_case=args.do_lower_case)

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
    model.to(device)

    ## test
    filenames = os.listdir(args.data_dir)
    model.eval()
    eval_iter = tqdm(filenames, disable=False)
    eval_answers = {}
    eval_accuracy = 0
    nb_eval_examples = 0
    for fid, filename in enumerate(eval_iter):
        file_path = os.path.join(args.data_dir, filename)
        eval_examples = read_race_examples(file_path)
        eval_features = convert_examples_to_features(eval_examples, tokenizer,
                                                     args.max_seq_length, True)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

        eval_answer = []
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            eval_answer.extend(chr(int(np.argmax(logits, axis=1)) + ord("A")))
            if args.has_ans:
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)
                # print(label_ids, np.argmax(logits, axis=1), tmp_eval_accuracy)
                eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
        eval_answers[filename] = eval_answer

    final_eval_accuracy = eval_accuracy / nb_eval_examples
    logger.info("eval accuracy: {}".format(final_eval_accuracy))
    result = json.dumps(eval_answers)
    output_eval_file = os.path.join(args.output_dir, "answers.json")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    with open(output_eval_file, "w") as f:
        f.write(result)
def main(): ''' argument parsing ''' parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() ''' end of argument parsing ''' ''' check whether gpu is available ''' if args.local_rank == -1 or args.no_cuda: # tell PyTorch to use GPU if one is available device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # get number of gpu n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) ''' Gradient accumulation: split the batch of samples into several mini batches running a configured number of steps without updating the model variables while accumulating the gradients of those steps and then using the accumulated gradients to compute the variable updates ''' if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) # gradient_accumulation_steps: number of updates to make per mini batch args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # set random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # end of setting seeds # check train/test mode if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # check output directory if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # load Bert tokenizer tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=4) # additional configurations if args.fp16: model.half() # load model to gpu/cpu model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # if it is training mode if args.do_train: # Prepare data loader # train_examples: raw text for each field train_examples = read_swag_examples(os.path.join( args.data_dir, 'train.csv'), is_training=True) # list of InputFeatures objects train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) #??? if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) #??? 
# Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) # TRAINING # model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: # back propagation? 
loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step # back propagation optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned # num_choices determines the shape of logits, since the model only produce binary classification score model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) # send model to gpu/cpu model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, 'val.csv'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) # raw predictions logits = logits.detach().cpu().numpy() # shape of logits label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, 
"w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(config, bert_vocab_file, bert_model_dir):
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)
    if not os.path.exists(config.cache_dir):
        os.makedirs(config.cache_dir)

    output_model_file = os.path.join(config.output_dir, config.weights_name)  # model output file
    output_config_file = os.path.join(config.output_dir, config.config_name)

    # prepare the device(s)
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)
    config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps

    """ set the random seeds """
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)

    tokenizer = BertTokenizer.from_pretrained(bert_vocab_file,
                                              do_lower_case=config.do_lower_case)
    label_list = ["0", "1", "2", "3"]

    if config.do_train:
        # prepare the data
        train_file = os.path.join(config.data_dir, "train.json")
        dev_file = os.path.join(config.data_dir, "dev.json")
        train_dataloader, train_len = load_data(train_file, tokenizer,
                                                config.max_seq_length, config.train_batch_size)
        dev_dataloader, dev_len = load_data(dev_file, tokenizer,
                                            config.max_seq_length, config.dev_batch_size)
        num_train_steps = int(train_len / config.train_batch_size /
                              config.gradient_accumulation_steps * config.num_train_epochs)

        # prepare the model
        model = BertForMultipleChoice.from_pretrained(bert_model_dir,
                                                      cache_dir=config.cache_dir,
                                                      num_choices=4)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

        # prepare the optimizer
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.warmup_proportion,
                             t_total=num_train_steps)

        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        train(config.num_train_epochs, n_gpu, train_dataloader, dev_dataloader, model,
              optimizer, criterion, config.gradient_accumulation_steps, device, label_list,
              output_model_file, output_config_file, config.log_dir, config.print_step)

    test_file = os.path.join(config.data_dir, "test.json")
    test_dataloader, _ = load_data(test_file, tokenizer,
                                   config.max_seq_length, config.test_batch_size)
    bert_config = BertConfig(output_config_file)
    model = BertForMultipleChoice(bert_config, num_choices=len(label_list))
    model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    """ prepare the loss function """
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    test_loss, test_acc, test_report = evaluate(model, test_dataloader, criterion,
                                                device, label_list)
    print("-------------- Test -------------")
    print(f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} %')
    for label in label_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))
    print_list = ['macro avg', 'weighted avg']
    for label in print_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'], test_report[label]['recall'],
            test_report[label]['f1-score']))
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

# Initialise seeds
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# Get tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Prepare model
model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=5)
"""Move the model to the GPU; printing the model shows:
   12 Encoder Layers
   1 Pooler Layer
   1 Dropout layer
   1 Final Classifier layer
"""
if fp16:
    model.half()
if n_gpu > 1:
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_dir = os.path.join(args.data_dir, 'train') train_examples = read_race_examples(train_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Trianing Epoch: {}/{}".format(ep+1, int(args.num_train_epochs))) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step%100 == 0: logger.info("Training loss: {}, global step: {}".format(tr_loss/nb_tr_steps, global_step)) ## evaluate on dev set if global_step % 1000 == 0: dev_dir = os.path.join(args.data_dir, 'dev') #dev_set = [dev_dir+'/high', dev_dir+'/middle'] eval_examples = read_race_examples(dev_dir) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in 
enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) ## Load a trained model that you have fine-tuned ## use this part if you want to load the trained model # model_state_dict = torch.load(output_model_file) # model = BertForMultipleChoice.from_pretrained(args.bert_model, # state_dict=model_state_dict, # num_choices=4) # model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') ## test high eval_examples = read_race_examples(test_dir) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = {'high_eval_loss': eval_loss, 'high_eval_accuracy': eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with 
open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_dir) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples result = {'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / (middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / (middle_nb_eval_examples + high_nb_eval_examples) result = {'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy} with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
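# The training loop above adjusts the learning rate by hand as args.learning_rate *
# warmup_linear(global_step / t_total, args.warmup_proportion), with warmup_linear imported
# from pytorch_pretrained_bert.optimization. For reference, that schedule is roughly the
# piecewise-linear function sketched below (linear warm-up, then linear decay); this is a
# sketch only, and the installed library version is authoritative.
def warmup_linear_sketch(x, warmup=0.002):
    # x is the fraction of training completed, i.e. global_step / t_total
    if x < warmup:
        return x / warmup  # ramp the multiplier from 0 to 1 during warm-up
    return 1.0 - x         # then decay it linearly towards 0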
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") #print(args.output_dir) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) ##Prepare training datasets train_examples = None num_train_optimization_steps = None df_train = transfer(data_file=os.path.join(args.data_dir, 'mc500.train.tsv'), answer_file=os.path.join(args.data_dir, 'mc500.train.ans')) df_train['answer'] = df_train['answer'].apply(lambda x: ord(x) - 65) train_examples = read_MCtest(df_train) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) print(len(train_examples), args.train_batch_size, args.gradient_accumulation_steps) ##Prepare evaluate datasets df_eval = transfer(data_file=os.path.join(args.data_dir, 'mc500.dev.tsv'), answer_file=os.path.join(args.data_dir, 'mc500.dev.ans')) df_eval['answer'] = df_eval['answer'].apply(lambda x: ord(x) - 65) eval_examples = read_MCtest(df_eval) # eval_examples = read_race(os.path.join(args.data_dir,"dev")) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data 
eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE, num_choices=4) model.to(device) #Freeze the embedding network and encoder layers print("Freeze network") for name, param in model.named_parameters(): ln = 24 if name.startswith('bert.encoder'): l = name.split('.') ln = int(l[3]) if name.startswith('bert.embeddings') or ln < 6: print(name) param.requires_grad = False # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.do_train: train(args, model, train_dataloader, optimizer) if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval: evaluate(args, model, eval_dataloader)
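# select_field() and accuracy() are used throughout these scripts but are not defined in
# this section. The sketches below show what they are assumed to do, following the usual
# run_race / run_swag utilities: gather one field across the per-choice features, and count
# correct argmax predictions in a batch. Names carry a _sketch suffix to avoid clashing
# with the real helpers.
import numpy as np

def select_field_sketch(features, field):
    # Returns a nested list of shape [num_examples][num_choices][max_seq_length]
    # for a given field such as 'input_ids' or 'input_mask'.
    return [[choice[field] for choice in feature.choices_features] for feature in features]

def accuracy_sketch(out, labels):
    # out: (batch, num_choices) logits as a numpy array; labels: (batch,) gold choice indices.
    return np.sum(np.argmax(out, axis=1) == labels)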
def main(): # Parameters #The path to data data_dir = 'RACE' #The BERT model to train, can be amongst bert-base-uncased, bert-large-uncased, bert-base-cased bert_model = 'bert-base-uncased' # The output directory to store model checkpoints output_dir = 'large_models' # The max total input length after tokenization # Longer sequences are truncated # Shorter ones are padded max_seq_length = 200 # The batch size (read https://github.com/google-research/bert/issues/38 to see why a batch of N questions actually means 4*N sequences to the model) # At a high level the 4 options are fed to the model as separate inputs, so a batch of 4 questions becomes 16 sequences train_batch_size = 4 # The learning rate learning_rate = 1e-5 # Number of epochs num_epochs = 3 # Gradient accumulation steps, loss scale and device setup gradient_accumulation_steps = 4 loss_scale = 128 warmup_proportion = 0.1 resume = False device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #device = "cpu" os.makedirs(output_dir, exist_ok=True) # Build the tokenizer tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) # Process the dataset and store it once computed if os.path.isfile('train_data_processed.txt'): print('Loading train data') with open('train_data_processed.txt', "rb") as fp: train_data = pickle.load(fp) print('Loaded train data') else: print('Processing train data') train_data = process_race(data_dir + '/', 'train/') with open('train_data_processed.txt', "wb") as fp: pickle.dump(train_data, fp) print('Processed train data') # Find features and store them once computed if os.path.isfile('train_features.txt'): print('Loading train features') with open('train_features.txt', "rb") as fp: train_features = pickle.load(fp) print('Loaded train features') else: print('Converting to features') train_features = convert_to_bert_style(train_data, tokenizer, max_seq_length) with open('train_features.txt', 'wb') as fp: pickle.dump(train_features, fp) print('Converted to features') train_batch_size = int(train_batch_size / gradient_accumulation_steps) # Find the total number of steps for training num_train_steps = int( len(train_data) / train_batch_size / gradient_accumulation_steps * num_epochs) # Initialize the model model = BertForMultipleChoice.from_pretrained( bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'model', num_choices=4) model.to(device) global_step = 0 parameters = get_params(model) optimizer = BertAdam(parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_steps) train_data = prepare_data(train_features) sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=sampler, batch_size=train_batch_size) for epochs in range(num_epochs): training_loss = 0 training_accuracy = 0 num_training_examples, num_training_steps = 0, 0 logger.info("Training Epoch: {}/{}".format(epochs + 1, int(num_epochs))) model.train() for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) loss = loss / gradient_accumulation_steps training_loss += loss.item() num_training_examples += input_ids.size(0) num_training_steps += 1 logits = model(input_ids, segment_ids, input_mask) tmp_train_accuracy = accuracy(logits.detach().cpu().numpy(), label_ids.to('cpu').numpy()) training_accuracy += tmp_train_accuracy loss.backward() if (step + 1) % gradient_accumulation_steps == 0: lr_this_step = learning_rate * warmup_linear( global_step / num_train_steps, 
warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 del batch if global_step % 100 == 0: logger.info("Training loss: {}, global step: {}".format( training_loss / num_training_steps, global_step)) logger.info("Training Accuracy: {}, epoch: {}".format( training_accuracy / num_training_examples, epochs)) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( output_dir, "pytorch_model_{}epochs.bin".format(global_step)) torch.save(model.state_dict(), output_model_file) ## evaluate on dev set dev_data = process_race(data_dir + '/', 'dev/') dev_features = convert_to_bert_style(dev_data, tokenizer, max_seq_length) dev_data = prepare_data(dev_features) dev_sampler = SequentialSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=1) model.eval() dev_loss, dev_accuracy = 0, 0 num_eval_examples, num_eval_steps = 0, 0 for step, batch in enumerate(dev_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) dev_loss += tmp_eval_loss.mean().item() dev_accuracy += tmp_eval_accuracy num_eval_examples += input_ids.size(0) num_eval_steps += 1 del batch dev_loss = dev_loss / num_eval_steps dev_accuracy = dev_accuracy / num_eval_examples result = { 'dev_loss': dev_loss, 'dev_accuracy': dev_accuracy, 'global_step': global_step, 'loss': training_loss / num_training_steps } output_eval_file = os.path.join(output_dir, "dev_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
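# As the batch-size comment in the function above notes, each RACE question contributes one
# sequence per answer option, so a batch of N questions means the encoder processes 4*N
# sequences. BertForMultipleChoice-style heads do this by flattening the choice dimension
# before BERT and reshaping the per-sequence scores back into per-question logits.
# Illustrative reshaping only, not the library implementation:
import torch

batch_size, num_choices, seq_len = 8, 4, 200
input_ids = torch.zeros(batch_size, num_choices, seq_len, dtype=torch.long)
flat_input_ids = input_ids.view(-1, seq_len)                   # (32, 200) sequences seen by the encoder
per_sequence_scores = torch.randn(flat_input_ids.size(0), 1)   # stand-in for the classifier output
choice_logits = per_sequence_scores.view(-1, num_choices)      # (8, 4) logits fed to the loss / accuracy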
def __init__(self,output_dir,model="best_model.bin",topk=1, bert_model="bert-large-cased",do_lower_case=False,train_batch_size=32,seed=42, eval_batch_size=64,max_seq_length=128,num_labels=4,grad_acc_steps=1, num_of_epochs=5,learning_rate=1e-5,warmup_proportion=0.1,action="train",fp16=False,loss_scale=0): print("Loading BERT QA Model") self.name = "BertQA" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() tokenizer = BertTokenizer.from_pretrained(bert_model,do_lower_case=do_lower_case) self.topk=topk self.device = device self.tokenizer=tokenizer self.max_seq_length = max_seq_length self.train_batch_size = int(train_batch_size / grad_acc_steps) self.eval_batch_size=eval_batch_size self.num_labels=num_labels self.grad_acc_steps=grad_acc_steps self.gradient_accumulation_steps = self.grad_acc_steps self.num_of_epochs = num_of_epochs self.output_dir = output_dir self.learning_rate = learning_rate self.warmup_proportion = warmup_proportion self.n_gpu = n_gpu self.fp16=fp16 self.loss_scale=loss_scale print("Action:",action) if action == "train": print("Params:") print("Device:",device) print("Max-Seq-Length:",max_seq_length) print("Train-Batch-Size:",train_batch_size) print("Num Labels:",num_labels) print("Gradient Accumulation Steps:",grad_acc_steps) print("Num of Train Epochs:",num_of_epochs) print("Output Dir:",output_dir) print("Learning Rate:",learning_rate) print("Warmup Proportion:",warmup_proportion) print("Number of Gpus:",n_gpu) os.makedirs(self.output_dir, exist_ok=True) model = BertForMultipleChoice.from_pretrained(bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1), num_choices = num_labels) if self.fp16: print("Model: FP16") model.half() model.to(device) model = torch.nn.DataParallel(model) self.model = model self.best_metric = None if action == "predict": output_model_file = os.path.join(output_dir,model) # Load a trained model that you have fine-tuned self.train_batch_size=80 print("Loading Model",output_model_file) model_state_dict = torch.load(output_model_file) model = BertForMultipleChoice.from_pretrained(bert_model, state_dict=model_state_dict, num_choices=num_labels) model.to(device) model = torch.nn.DataParallel(model) self.model = model
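# Illustrative construction of the wrapper whose __init__ is shown above. The enclosing
# class is assumed to be named BertQA (after self.name), and every argument value here is
# an example only; defaults from the signature are used where omitted.
qa = BertQA(output_dir="qa_output",
            model="best_model.bin",
            bert_model="bert-large-cased",
            do_lower_case=False,
            train_batch_size=32,
            grad_acc_steps=2,
            num_of_epochs=5,
            learning_rate=1e-5,
            num_labels=4,
            action="train")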
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run prediction on a given dataset.") parser.add_argument("--input_file_for_pred", default=None, type=str, help="File to run prediction on.") parser.add_argument("--output_file_for_pred", default=None, type=str, help="File to output predictions into.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "anli": AnliProcessor, "anli3": AnliProcessor3Option, 'anli_csk': AnliWithCSKProcessor, 'bin_anli': BinaryAnli, 'wsc': WSCProcessor } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if task_name == 'bin_anli': model = BertForSequenceClassification.from_pretrained( args.bert_model, len(label_list)) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, len(label_list), len(label_list)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 model_save_path = os.path.join(args.output_dir, "bert-finetuned.model") tr_loss = None if args.do_train: if task_name.lower().startswith( "anli") or task_name.lower().startswith("wsc"): 
train_features = convert_examples_to_features_mc( train_examples, label_list, args.max_seq_length, tokenizer) else: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 status_tqdm = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(status_tqdm): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 status_tqdm.set_description_str( "Iteration / Training Loss: {}".format( (tr_loss / nb_tr_examples))) torch.save(model, model_save_path) if args.do_eval: if args.do_predict and args.input_file_for_pred is not None: eval_examples = processor.get_examples_from_file( args.input_file_for_pred) else: eval_examples = processor.get_dev_examples(args.data_dir) if task_name.lower().startswith( "anli") or task_name.lower().startswith("wsc"): eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) else: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = 
torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) logger.info( "***** Loading model from: {} *****".format(model_save_path)) model = torch.load(model_save_path) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 eval_predictions = [] eval_pred_probs = [] logger.info("***** Predicting ... *****".format(model_save_path)) for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_predictions.extend(np.argmax(logits, axis=1).tolist()) eval_pred_probs.extend([_compute_softmax(list(l)) for l in logits]) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps if tr_loss is not None else 0.0 } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if task_name == "wsc": pred_examples = list(TsvIO.read(args.input_file_for_pred)) else: pred_examples = read_jsonl_lines(args.input_file_for_pred) logger.info("***** Eval predictions *****") for record, pred, probs in zip(pred_examples, eval_predictions, eval_pred_probs): record['bert_prediction'] = pred record['bert_correct'] = pred == ( int(record[processor.label_field()]) - 1) record['bert_pred_probs'] = probs write_items([json.dumps(r) for r in pred_examples], args.output_file_for_pred)
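# _compute_softmax() above is assumed to be the numerically stable softmax helper from the
# original BERT example utilities; a minimal sketch consistent with how it is called here
# (a list of per-choice logits in, a list of probabilities out):
import math

def _compute_softmax_sketch(scores):
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]  # subtract the max for numerical stability
    total = sum(exps)
    return [e / total for e in exps]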
from tqdm import tqdm, trange from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE from pytorch_pretrained_bert.modeling import BertForMultipleChoice from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear from pytorch_pretrained_bert.tokenization import BertTokenizer # In[2]: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) model_state_dict = torch.load( "/home/ting/pytorch-pretrained-BERT/base_models/weights-bert-base-uncased") model = BertForMultipleChoice.from_pretrained('bert-base-uncased', state_dict=model_state_dict, num_choices=4) model.cuda().eval() # # API for calling when questions are put into Exam Keeper # In[3]: def feedexample(question): example = SwagExample(swag_id=question['id'], context_sentence=question['context_sentence'], start_ending=question['start_ending'], ending_0=question['ending_0'], ending_1=question['ending_1'], ending_2=question['ending_2'],
class ReadingComprehension(): #Reading Comprehension from run_race import RaceExample, convert_examples_to_features, select_field, accuracy import os import json import numpy as np import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from pytorch_pretrained_bert.tokenization import BertTokenizer from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertModel ############## from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE import functools # In[2]: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) model_state_dict = torch.load( "/home/ting/BERT-RACE/base_models/pytorch_model.bin") model = BertForMultipleChoice.from_pretrained( 'bert-base-uncased', # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1), state_dict=model_state_dict, num_choices=4) # In[3]: model.cuda().eval() # # API for calling when questions are put into Exam Keeper # In[4]: import time BATCH_SIZE = 8 def feedexample(question): example = RaceExample(race_id=question['id'], context_sentence=question['article'], start_ending=question['question'], ending_0=question['options'][0], ending_1=question['options'][1], ending_2=question['options'][2], ending_3=question['options'][3], label=question['truth']) return example def reading_comprehension_api(questions): eval_examples = [feedexample(question) for question in questions] eval_features = convert_examples_to_features(eval_examples, tokenizer, 512, True) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE) output_probability = torch.zeros((len(eval_features), 4)) for step, batch in enumerate(eval_dataloader): start = time.time() batch = tuple(t.cuda() for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu() end = time.time() # print(end - start) output_probability[step * BATCH_SIZE:(step + 1) * BATCH_SIZE] = torch.softmax(logits, 1) return output_probability
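# Minimal usage sketch for reading_comprehension_api() defined above. The passage, question
# and options are invented, but the dict keys ('id', 'article', 'question', 'options',
# 'truth') match what feedexample() expects.
sample_question = {
    'id': 'demo-1',
    'article': 'Tom went to the market to buy apples, but the market was closed, so he went home.',
    'question': 'Why did Tom go home?',
    'options': ['The market was closed.', 'He had already bought apples.',
                'It started to rain.', 'He met a friend.'],
    'truth': 0,
}
probabilities = reading_comprehension_api([sample_question])  # (1, 4) tensor of softmax scores
predicted_option = int(probabilities[0].argmax())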
def main(MARGIN=0.15): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=40, type=int, help="Total batch size for eval.") parser.add_argument("--do_margin_loss", default=0, type=int, help="Use margin loss or log-loss.") parser.add_argument("--do_prediction", default=0, type=int, help="Use margin loss or log-loss.") parser.add_argument("--learning_rate", default=6.25e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) parser.add_argument('--margin', type=float, default=0.15, help='Margin value used in the MultiMarginLoss.') # parser.add_argument('--gpuid', type=int, default=-1,help='The gpu id to use') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: if not args.no_cuda: # device = torch.device("cuda",args.gpuid) # torch.cuda.set_device(args.gpuid) dummy = torch.cuda.FloatTensor(1) else: device = torch.device("cpu") n_gpu = 1 # n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) # logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None eval_examples = None num_train_steps = None eval_size = 0 if args.do_train: # train_examples = read_roc_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) train_examples = read_roc_examples(os.path.join( args.data_dir, 'test.csv'), is_training=True) # train_examples = read_roc_examples(os.path.join(args.data_dir, 'val_test.csv'), is_training = True) # train_examples = read_roc_examples(os.path.join(args.data_dir, 'val_test_valnew.csv'), is_training = True) if args.do_eval: eval_examples = read_roc_examples(os.path.join( args.data_dir, 'valnew.csv'), is_training=True) # train_examples=train_examples+eval_examples[0:1000] # eval_examples=eval_examples[1000:] eval_size = len(eval_examples) print(len(train_examples), len(eval_examples)) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) else: num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.do_margin_loss == 0: model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=2) elif args.do_margin_loss == 1: model = BertForMultipleChoiceMarginLoss.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=2, margin=args.margin) if args.fp16: model.half() if not args.no_cuda: model.cuda() if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, 
param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # eval_examples = read_roc_examples(os.path.join(args.data_dir, 'valnew.csv'), is_training = True) # eval_examples = read_roc_examples(os.path.join(args.data_dir, 'test.csv'), is_training = True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) # logger.info("***** Running evaluation *****") # logger.info(" Num examples = %d", len(eval_examples)) args.eval_batch_size = args.train_batch_size # logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # do_evaluation(model,eval_dataloader,args) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = read_roc_examples(os.path.join(args.data_dir, 'testnew.csv'), is_training=False) test_features = convert_examples_to_features(test_examples, tokenizer, args.max_seq_length, True) # logger.info("***** Running testuation *****") # logger.info(" Num examples = %d", len(test_examples)) # logger.info(" Batch size = %d", args.test_batch_size) all_input_ids = torch.tensor(select_field(test_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(test_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(test_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) # do_evaluation(model,eval_dataloader,args) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) # logger.info("***** Running training *****") # logger.info(" Num examples = %d", len(train_examples)) # logger.info(" Batch size = %d", args.train_batch_size) # logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), 
dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) best_eval_acc = 0.0 best_step = 0 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): # for epoch in range(int(args.num_train_epochs)): print("Epoch:", epoch) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.cuda() if not args.no_cuda else t for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if epoch > 0: logits_all, eval_accuracy = do_evaluation( model, eval_dataloader, args, is_training=False) if best_eval_acc < eval_accuracy: best_eval_acc = eval_accuracy best_step = global_step if args.do_prediction: predict_on_test(model, test_dataloader, args, epoch, best_eval_acc) print(best_eval_acc) # torch.save(model.state_dict(), os.path.join(args.output_dir, f"pytorch_model.bin")) print(best_eval_acc, args.seed, args.margin, args.do_margin_loss, args.learning_rate, args.bert_model, sys.argv[0]) result = f"best_eval_acc={best_eval_acc},args.seed={args.seed},args.do_margin_loss={args.do_margin_loss},args.margin={args.margin},best_step={best_step},train_batch_size={args.train_batch_size},eval_size={eval_size},script_name={sys.argv[0]},model={args.bert_model},learning_rate={args.learning_rate},num_train_epochs={args.num_train_epochs},max_seq_length={args.max_seq_length}" write_result_to_file(args, result)
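# BertForMultipleChoiceMarginLoss is a project-specific head that is not defined in this
# section; the sketch below only illustrates the general idea its name and the --margin
# argument suggest, i.e. replacing cross-entropy over the two ending scores with a
# multi-class margin loss. This is an assumption, not the actual implementation used above.
import torch.nn as nn

def margin_loss_from_logits(logits, labels, margin=0.15):
    # logits: (batch, num_choices) scores; labels: (batch,) index of the correct ending.
    return nn.MultiMarginLoss(margin=margin)(logits, labels)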
def BertSwag(mode='eval', bert_model="bert-base-uncased", data_dir='./SWAG_data', output_dir='./SWAG_output'): parser = {} parser["data_dir"] = data_dir parser["bert_model"] = bert_model parser["output_dir"] = output_dir parser["max_seq_length"] = 128 parser["do_train"] = (mode == 'train') parser["do_eval"] = (mode == 'eval') parser["do_lower_case"] = ('uncased' in bert_model) parser["train_batch_size"] = 32 parser["eval_batch_size"] = 8 parser["learning_rate"] = 5e-5 parser["num_train_epochs"] = 3.0 parser["warmup_proportion"] = 0.1 parser["no_cuda"] = False parser["local_rank"] = -1 parser['seed'] = 42 parser['gradient_accumulation_steps'] = 1 parser['fp16'] = False parser['loss_scale'] = 0 args = AttrDict.AttrDict(parser) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: # Prepare data loader train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = 
len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = 
{'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/global_step} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
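The evaluation loops above and below accumulate the return value of an accuracy helper that is imported rather than defined in these snippets; assuming it follows the usual run_swag convention (count of argmax matches over a numpy batch), a minimal sketch is:

import numpy as np

def accuracy(out, labels):
    # out: (batch_size, num_choices) logits as a numpy array
    # labels: (batch_size,) gold choice indices
    # Returns the number of correct predictions in the batch; the callers
    # above sum these counts and divide by the total number of examples.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)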
def train(args, outdir): # Set GPU n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}".format(args.device, n_gpu)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Create training directory if not os.path.exists(outdir): os.makedirs(outdir) # Model tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(-1)), num_choices=4) model.to(args.device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Load data train_examples = read_samples(os.path.join(args.data_dir, args.csvtrain)) num_train_optimization_steps = int( len(train_examples) / args.batch_size) * args.num_train_epochs train_features = convert_to_features(train_examples, tokenizer, args.max_seq_length) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) # Optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(outdir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(outdir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string())
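train() reads its hyperparameters from an argparse-style namespace; a minimal invocation sketch covering every attribute the function touches (the values are illustrative, not the settings used for any reported model):

from argparse import Namespace
import torch

args = Namespace(
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    seed=42,
    bert_model='bert-base-uncased',
    do_lower_case=True,
    data_dir='data',            # directory containing the training csv
    csvtrain='train.csv',
    max_seq_length=128,
    batch_size=32,
    learning_rate=5e-5,
    warmup_proportion=0.1,
    num_train_epochs=3,
)
train(args, outdir='output/mc_model')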
class FitBestSentence(): #FitSentence from run_swag import SwagExample, convert_examples_to_features, select_field, accuracy import csv import os import random import sys from io import open import numpy as np import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE from pytorch_pretrained_bert.modeling import BertForMultipleChoice from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear from pytorch_pretrained_bert.tokenization import BertTokenizer # In[2]: tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) model_state_dict = torch.load( "/home/ting/pytorch-pretrained-BERT/base_models/weights-bert-base-uncased" ) model = BertForMultipleChoice.from_pretrained('bert-base-uncased', state_dict=model_state_dict, num_choices=4) model.cuda().eval() # # API for calling when questions are put into Exam Keeper # In[3]: def feedexample(question): example = SwagExample(swag_id=question['id'], context_sentence=question['context_sentence'], start_ending=question['start_ending'], ending_0=question['ending_0'], ending_1=question['ending_1'], ending_2=question['ending_2'], ending_3=question['ending_3'], label=question['label']) return example def fit_the_best_sentence_api(questions): eval_examples = [feedexample(question) for question in questions] eval_features = convert_examples_to_features(eval_examples, tokenizer, 512, True) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=8) output_probability = torch.zeros((len(eval_features), 4)) for step, batch in enumerate(eval_dataloader): batch = tuple(t.cuda() for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu() output_probability[step * 8:(step + 1) * 8] = torch.softmax( logits, 1) return output_probability
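A usage sketch for the API above; the dictionary keys mirror the fields consumed by feedexample, and the question content and label are purely illustrative:

questions = [{
    'id': 'q1',
    'context_sentence': 'Tom forgot his umbrella and the rain was getting heavier.',
    'start_ending': 'He decided to',
    'ending_0': 'wait under the awning until it stopped.',
    'ending_1': 'plant a tree in the park.',
    'ending_2': 'buy a new car immediately.',
    'ending_3': 'paint his house blue.',
    'label': 0,
}]
probs = fit_the_best_sentence_api(questions)  # tensor of shape (len(questions), 4)
best_choice = probs.argmax(dim=1)             # index of the best-fitting ending per question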
def get_scores(args, split='test'): if split == 'train': input_file = os.path.join(args.data_dir, args.csvtrain) filescores = os.path.join( args.data_dir, 'PriorScores/priorscores_answers_train.pckl') elif split == 'val': input_file = os.path.join(args.data_dir, args.csvval) filescores = os.path.join(args.data_dir, 'PriorScores/priorscores_answers_val.pckl') elif split == 'test': input_file = os.path.join(args.data_dir, args.csvtest) filescores = os.path.join(args.data_dir, 'PriorScores/priorscores_answers_test.pckl') if os.path.exists(filescores): return # Load Model tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) output_model_file = os.path.join(outdir, WEIGHTS_NAME) output_config_file = os.path.join(outdir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForMultipleChoice(config, num_choices=4) model.load_state_dict(torch.load(output_model_file)) model.to(args.device) n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}".format(args.device, n_gpu)) if n_gpu > 1: model = torch.nn.DataParallel(model) # Data eval_examples = read_samples(input_file) eval_features = convert_to_features(eval_examples, tokenizer, args.max_seq_length) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_labels = torch.tensor([example.label for example in eval_examples], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Run prediction logger.info("***** Compute prior scores *****") logger.info("Num examples = %d", len(eval_examples)) logger.info("Batch size = %d", args.eval_batch_size) model.eval() batch_idx = 0 for _, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")): input_ids, input_mask, segment_ids, truelabel = batch input_ids = input_ids.to(args.device) input_mask = input_mask.to(args.device) segment_ids = segment_ids.to(args.device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = nn.functional.softmax(logits) logits = logits.detach().cpu().numpy() if batch_idx == 0: scores = logits else: scores = np.concatenate((scores, logits), axis=0) batch_idx += 1 if not os.path.exists(os.path.dirname(filescores)): os.mkdir(os.path.dirname(filescores)) utils.save_obj(scores, filescores) logger.info('Prior scores for %s saved into %s' % (split, filescores))
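Downstream code can read the prior scores back from the pickle written above; assuming utils.save_obj is a thin pickle wrapper, a matching loader would look like this:

import pickle

def load_prior_scores(filescores):
    # Returns the (num_examples, 4) array of softmax probabilities
    # over the answer choices that get_scores saved.
    with open(filescores, 'rb') as f:
        return pickle.load(f)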
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join( args.data_dir, 'train.csv'), is_training=True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) predictor = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=cache_dir, num_choices=4) adversary = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=3) if args.fp16: predictor.half() adversary.half() predictor.to(device) adversary.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) predictor = DDP(predictor) adversary = DDP(adversary) elif n_gpu > 1: predictor = torch.nn.DataParallel(predictor) adversary = torch.nn.DataParallel(adversary) # Prepare optimizer param_optimizer_pred = list(predictor.named_parameters()) param_optimizer_adv = list(adversary.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer_pred = [ n for n in param_optimizer_pred if 'pooler' not in n[0] ] param_optimizer_adv = [ n for n in param_optimizer_adv if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters_pred = [{ 'params': [ p for n, p in param_optimizer_pred if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_pred if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer_grouped_parameters_adv = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer_pred = FusedAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) optimizer_adv = FusedAdam(optimizer_grouped_parameters_adv, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer_pred = FP16_Optimizer(optimizer_pred, dynamic_loss_scale=True) optimizer_adv = FP16_Optimizer(optimizer_adv, dynamic_loss_scale=True) else: optimizer_pred = FP16_Optimizer(optimizer_pred, static_loss_scale=args.loss_scale) optimizer_adv = FP16_Optimizer(optimizer_adv, static_loss_scale=args.loss_scale) else: optimizer_pred = BertAdam(optimizer_grouped_parameters_pred, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) optimizer_adv = BertAdam(optimizer_grouped_parameters_adv, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) alpha = 1 global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(train_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(train_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor( [f.protected_attr for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) training_history = [] # predictor.train() adversary.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss_pred, tr_loss_adv = 0, 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids = batch # loss_pred, logits = predictor(input_ids, segment_ids, input_mask, label_ids) # predicted_vps = torch.argmax(logits, dim=1) # print("predicted_vps: ", predicted_vps) # predicted_vps = predicted_vps.view(-1, 1).repeat(1, vp_input_ids.size(2)).view([-1, 1, vp_input_ids.size(2)]) # vp_input_ids = torch.gather(vp_input_ids, dim=1, index=predicted_vps) # vp_input_ids = vp_input_ids.view([vp_input_ids.size(0), -1]) # vp_input_mask = torch.gather(vp_input_mask, dim=1, index=predicted_vps) # vp_input_mask = vp_input_mask.view([vp_input_mask.size(0), -1]) vp_input_ids = vp_input_ids.view( vp_input_ids.shape[0] * vp_input_ids.shape[1], vp_input_ids.shape[2]) vp_input_mask = vp_input_mask.view( vp_input_mask.shape[0] * vp_input_mask.shape[1], vp_input_mask.shape[2]) vp_input_ids = torch.cat([ 100 * torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_ids ], dim=1) 
vp_input_mask = torch.cat([ torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_mask ], dim=1) #protected_attr_ids = protected_attr_ids.t() protected_attr_ids = protected_attr_ids.repeat(4, 1).t() protected_attr_ids = protected_attr_ids.reshape( protected_attr_ids.shape[0] * protected_attr_ids.shape[1]) # print(vp_input_ids.shape, vp_input_mask.shape, protected_attr_ids.shape) # print(protected_attr_ids.detach().to('cpu').numpy()) loss_adv, _ = adversary(vp_input_ids, None, vp_input_mask, protected_attr_ids) if n_gpu > 1: # loss_pred = loss_pred.mean() # mean() to average on multi-gpu. loss_adv = loss_adv.mean() if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html # loss_pred = loss_pred * args.loss_scale loss_adv = loss_adv * args.loss_scale if args.gradient_accumulation_steps > 1: # loss_pred = loss_pred / args.gradient_accumulation_steps loss_adv = loss_adv / args.gradient_accumulation_steps # tr_loss_pred += loss_pred.item() tr_loss_adv += loss_adv.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 training_history.append([loss_adv.item()]) # loss = loss_pred - alpha * loss_adv # if args.fp16: # optimizer_pred.backward(loss) # else: # loss.backward(retain_graph=True) # if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer_pred.param_groups: param_group['lr'] = lr_this_step for param_group in optimizer_adv.param_groups: param_group['lr'] = lr_this_step # optimizer_pred.step() # optimizer_pred.zero_grad() # optimizer_adv.zero_grad() if args.fp16: optimizer_adv.backward(loss_adv) else: loss_adv.backward() optimizer_adv.step() optimizer_adv.zero_grad() global_step += 1 history_file = open(os.path.join(args.output_dir, "train_results.csv"), "w") writer = csv.writer(history_file, delimiter=",") writer.writerow(["adv_loss"]) for row in training_history: writer.writerow(row) if args.do_train: # Save a trained model and the associated configuration # model_to_save = predictor.module if hasattr(predictor, 'module') else predictor # Only save the model it-self WEIGHTS_NAME = 'weights.pt' CONFIG_NAME = 'config.json' # output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) # torch.save(model_to_save.state_dict(), output_model_file) # output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) # with open(output_config_file, 'w') as f: # f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned # config = BertConfig(output_config_file) # predictor = BertForMultipleChoice(config, num_choices=4) # predictor.load_state_dict(torch.load(output_model_file)) # Do the same for adversary model_to_save = adversary.module if hasattr( adversary, 'module') else adversary # Only save the model it-self output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=3) 
adversary.load_state_dict(torch.load(output_model_file)) else: WEIGHTS_NAME = 'weights.pt' CONFIG_NAME = 'config.json' # output_model_file = os.path.join(args.output_dir, 'predictor_' + WEIGHTS_NAME) # output_config_file = os.path.join(args.output_dir, 'predictor_' + CONFIG_NAME) # config = BertConfig(output_config_file) # predictor = BertForMultipleChoice(config, num_choices=4) # predictor.load_state_dict(torch.load(output_model_file)) output_model_file = os.path.join(args.output_dir, 'adversary_' + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, 'adversary_' + CONFIG_NAME) config = BertConfig(output_config_file) adversary = BertForSequenceClassification(config, num_labels=3) adversary.load_state_dict(torch.load(output_model_file)) # predictor = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) # adversary = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=3) # predictor.to(device) adversary.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, 'val.csv'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) all_vp_input_ids = torch.tensor(select_field(eval_features, 'vp_input_ids'), dtype=torch.long) all_vp_input_mask = torch.tensor(select_field(eval_features, 'vp_input_mask'), dtype=torch.long) all_protected_attr = torch.tensor( [f.protected_attr for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label, all_vp_input_ids, all_vp_input_mask, all_protected_attr) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # predictor.eval() adversary.eval() eval_loss_pred, eval_accuracy_pred = 0, 0 eval_loss_adv, eval_accuracy_adv = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, vp_input_ids, vp_input_mask, protected_attr_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) vp_input_ids = vp_input_ids.to(device) vp_input_mask = vp_input_mask.to(device) protected_attr_ids = protected_attr_ids.to(device) # with torch.no_grad(): # tmp_eval_loss_pred, logits_pred = predictor(input_ids, segment_ids, input_mask, label_ids) # predicted_vps = torch.argmax(logits_pred, dim=1) # predicted_vps = predicted_vps.view(-1, 1).repeat(1, vp_input_ids.size(2)).view([-1, 1, vp_input_ids.size(2)]) # vp_input_ids = torch.gather(vp_input_ids, dim=1, index=predicted_vps) # vp_input_ids = vp_input_ids.view([vp_input_ids.size(0), -1]) # vp_input_mask = torch.gather(vp_input_mask, dim=1, index=predicted_vps) # vp_input_mask = vp_input_mask.view([vp_input_mask.size(0), -1]) vp_input_ids = vp_input_ids.view( vp_input_ids.shape[0] * vp_input_ids.shape[1], 
vp_input_ids.shape[2]) vp_input_mask = vp_input_mask.view( vp_input_mask.shape[0] * vp_input_mask.shape[1], vp_input_mask.shape[2]) vp_input_ids = torch.cat([ 100 * torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_ids ], dim=1) vp_input_mask = torch.cat([ torch.ones(vp_input_ids.shape[0], 1).long().to(device), vp_input_mask ], dim=1) #protected_attr_ids = protected_attr_ids.t() protected_attr_ids = protected_attr_ids.repeat(4, 1).t() protected_attr_ids = protected_attr_ids.reshape( protected_attr_ids.shape[0] * protected_attr_ids.shape[1]) with torch.no_grad(): tmp_eval_loss_adv, logits_adv = adversary( vp_input_ids, None, vp_input_mask, protected_attr_ids) # print("logits_adv", logits_adv) # tmp_eval_accuracy_pred = accuracy(logits_pred, label_ids) print("logits_adv shape ", logits_adv.shape) print("protected_attr_ids shape ", protected_attr_ids.shape) tmp_eval_accuracy_adv = accuracy(logits_adv, protected_attr_ids) # eval_loss_pred += tmp_eval_loss_pred.mean().item() # eval_accuracy_pred += tmp_eval_accuracy_pred.item() eval_loss_adv += tmp_eval_loss_adv.mean().item() eval_accuracy_adv += tmp_eval_accuracy_adv.item() nb_eval_examples += input_ids.size(0) * 4 nb_eval_steps += 1 print("eval_accuracy_adv", eval_accuracy_adv) print("nb_eval_examples", nb_eval_examples) # eval_loss_pred /= nb_eval_steps # eval_accuracy_pred /= nb_eval_examples eval_loss_adv /= nb_eval_steps eval_accuracy_adv /= nb_eval_examples print("========================================================") print("eval_accuracy_adv", eval_accuracy_adv) print("nb_eval_examples", nb_eval_examples) if args.do_train: result = { # 'eval_loss_pred': eval_loss_pred, # 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv, 'global_step': global_step, # 'loss_pred': tr_loss_pred/nb_tr_steps, 'loss_adv': tr_loss_adv / nb_tr_steps } else: result = { # 'eval_loss_pred': eval_loss_pred, # 'eval_accuracy_pred': eval_accuracy_pred, 'eval_loss_adv': eval_loss_adv, 'eval_accuracy_adv': eval_accuracy_adv } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
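The repeat(4, 1).t().reshape(...) pattern used in both the training and evaluation loops above aligns one protected-attribute label per example with the four flattened answer choices; a small standalone illustration of the same reshaping:

import torch

protected_attr_ids = torch.tensor([0, 2, 1])      # one label per example
expanded = protected_attr_ids.repeat(4, 1).t()    # shape (3, 4): label copied once per choice
expanded = expanded.reshape(expanded.shape[0] * expanded.shape[1])
print(expanded)  # tensor([0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])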
logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare model model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE, num_choices=4) model.to(device) #Freeze the embedding network and encoder layers print("Freeze network") for name, param in model.named_parameters(): ln = 24 if name.startswith('bert.encoder'): l = name.split('.') ln = int(l[3]) if name.startswith('bert.embeddings') or ln < 6: print(name) param.requires_grad = False
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=512, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=1, type=int, help="Total batch size for eval.") parser.add_argument("--n_classes", default=3, type=int, help="Number of classes to pass to BERT. (2 or 3)") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=1000, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Output directory if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = read_GAP_examples(os.path.join(args.data_dir, 'gap-test.tsv'), is_training=True, n_classes=args.n_classes) eval_examples = read_GAP_examples(os.path.join(args.data_dir, 'gap-validation.tsv'), is_training=True, n_classes=args.n_classes) num_train_optimization_steps = int(len(train_examples) / args.train_batch_size) * args.num_train_epochs # Prepare model if args.n_classes == 3: model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_-1'), num_choices=3) else: model = BertForTwoChoices.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_-1'), num_choices_in=2, num_choices_out=3) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': 
[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Training and validation loop logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) # For early stopping... best_accuracy = 0. patience = 0 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() logger.info("\n***** Running training *****") tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 print('Training loss {}'.format(tr_loss/nb_tr_steps)) logger.info("\n***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss/nb_tr_steps} if eval_accuracy >= best_accuracy: output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) best_accuracy = eval_accuracy patience = 0 elif patience < 4: patience += 1 logger.info("Patience {}".format(patience)) else: logger.info("***** Early Stopping *****") logger.info("Best eval accuracy: {}".format(best_accuracy)) logger.info("Epoch: {}".format(epoch)) break # Save the config output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write((model.module if hasattr(model, 'module') else model).config.to_json_string())
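Once the loop stops, the best checkpoint written by the early-stopping branch can be reloaded for prediction; a minimal sketch assuming the three-class configuration (the n_classes == 2 path would rebuild BertForTwoChoices instead):

from pytorch_pretrained_bert.modeling import BertConfig

# WEIGHTS_NAME and CONFIG_NAME are the same constants used when saving above.
config = BertConfig(os.path.join(args.output_dir, CONFIG_NAME))
best_model = BertForMultipleChoice(config, num_choices=3)
best_model.load_state_dict(torch.load(os.path.join(args.output_dir, WEIGHTS_NAME),
                                      map_location=device))
best_model.to(device)
best_model.eval()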
def main(): parser = argparse.ArgumentParser() #drive.mount('/content/gdrive') swagDir = './' cacheDir = './cache/' saveDir = './save/cache/' ## Required parameters parser.add_argument("--data_dir", default=swagDir, type=str, #required=True, help="The input data dir. Should contain the .csv files (or other data files) for the task.") parser.add_argument("--bert_model", default="bert-base-uncased", type=str, #required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", # "mnli" or "mrpc" default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=saveDir, type=str, #required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=100, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default = True, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default = True, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() device = torch.device("cuda") n_gpu = torch.cuda.device_count() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) """ if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') """ logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) modelinit = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: modelinit.half() modelinit.to(device) modelinit.load_state_dict(torch.load('./cache/pytorch_model.bin')) """ if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") modelinit = DDP(modelinit) elif n_gpu > 1: modelinit = torch.nn.DataParallel(modelinit) """ train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = 
BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=4) if args.fp16: model.half() model.to(device) model.bert = modelinit.bert model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 #for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically #lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) lr_this_step = args.learning_rate for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForMultipleChoice(config, num_choices=4) model.load_state_dict(torch.load(output_model_file)) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 
'global_step': global_step, 'loss': tr_loss/nb_tr_steps} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
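Several of these scripts dump their metrics as key = value lines; a small convenience reader (not part of the original scripts) for pulling eval_results.txt back into a dictionary:

def read_eval_results(path):
    # Parse the "key = value" lines written by the evaluation blocks above.
    results = {}
    with open(path) as f:
        for line in f:
            if '=' in line:
                key, value = line.split('=', 1)
                results[key.strip()] = float(value.strip())
    return results

results = read_eval_results(os.path.join(args.output_dir, "eval_results.txt"))
print(results.get('eval_accuracy'))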