def create_and_check_distilbert_for_multiple_choice(
    self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    config.num_choices = self.num_choices
    model = DistilBertForMultipleChoice(config=config)
    model.to(torch_device)
    model.eval()
    multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    result = model(
        multiple_choice_inputs_ids,
        attention_mask=multiple_choice_input_mask,
        labels=choice_labels,
    )
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices))
# Variant of the same check written against the older tuple-returning API,
# where the model call yields (loss, logits) instead of a ModelOutput.
def create_and_check_distilbert_for_multiple_choice(
    self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    config.num_choices = self.num_choices
    model = DistilBertForMultipleChoice(config=config)
    model.to(torch_device)
    model.eval()
    multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
    loss, logits = model(
        multiple_choice_inputs_ids,
        attention_mask=multiple_choice_input_mask,
        labels=choice_labels,
    )
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
    self.check_loss_output(result)
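# Both checks build multiple-choice inputs by duplicating a single (batch, seq_len)
# tensor across a choice dimension with unsqueeze(1).expand(...).contiguous().
# A minimal, self-contained sketch of that shaping trick (illustration only;
# the tensor sizes below are arbitrary, not taken from the tests above):
import torch

batch_size, seq_len, num_choices = 2, 5, 4
input_ids = torch.randint(0, 100, (batch_size, seq_len))                 # (2, 5)
mc_input_ids = input_ids.unsqueeze(1).expand(-1, num_choices, -1).contiguous()
print(mc_input_ids.shape)                                                # torch.Size([2, 4, 5])
# Every choice slot holds a copy of the same sequence, giving the
# (batch, num_choices, seq_len) shape that *ForMultipleChoice heads expect.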
def main():
    # num_train_epochs = 8
    # train_batch_size = 8
    # max_seq_length = 512
    # learning_rate = 1e-5
    # warmup_proportion = 0.1
    # gradient_accumulation_steps = 4
    # data_dir = './Dataset/RACE/RACE/'
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--bert_type", default=None, type=int, required=True,
                        help="0: bert, 1: distilbert")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = None
    if args.bert_type == 0:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    elif args.bert_type == 1:
        tokenizer = DistilBertTokenizerFast.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_dir = os.path.join(args.data_dir, 'train')
        train_examples = read_race_examples([train_dir + '/high', train_dir + '/middle'])
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = None
    if args.bert_type == 0:
        model = BertForMultipleChoice.from_pretrained(
            args.bert_model,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
            num_choices=4)
    elif args.bert_type == 1:
        model = DistilBertForMultipleChoice.from_pretrained(args.bert_model)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used,
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = None
    if args.bert_type == 0:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)
    elif args.bert_type == 1:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for ep in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            count = 0
            logger.info("Training Epoch: {}/{}".format(ep + 1, int(args.num_train_epochs)))
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                if args.bert_type == 0:
                    loss = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 labels=label_ids)
                elif args.bert_type == 1:
                    result = model(input_ids=input_ids,
                                   # token_type_ids=segment_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids)
                    loss = result['loss']
                    logits = result['logits']
                    #######
                    compare = np.array(label_ids.cpu()) == np.array(logits.argmax(axis=1).cpu())
                    count += np.sum(compare)
                    print("\nLabel: {}, Prediction: {}, Accuracy: {}"
                          .format(label_ids, logits.argmax(axis=1), count / (args.train_batch_size * (step + 1))))
                    #######
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % 100 == 0:
                    logger.info("Training loss: {}, global step: {}".format(tr_loss / nb_tr_steps, global_step))

                ## evaluate on dev set
                if global_step % 1000 == 0:
                    dev_dir = os.path.join(args.data_dir, 'dev')
                    dev_set = [dev_dir + '/high', dev_dir + '/middle']
                    eval_examples = read_race_examples(dev_set)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length, True)
                    logger.info("***** Running evaluation: Dev *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)
                    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    # separate loop variable so the outer training `step` is not clobbered
                    for eval_step, batch in enumerate(eval_dataloader):
                        batch = tuple(t.to(device) for t in batch)
                        input_ids, input_mask, segment_ids, label_ids = batch
                        with torch.no_grad():
                            if args.bert_type == 0:
                                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                                logits = model(input_ids, segment_ids, input_mask)
                            elif args.bert_type == 1:
                                result = model(input_ids=input_ids,
                                               attention_mask=input_mask,
                                               labels=label_ids)
                                tmp_eval_loss = result['loss']
                                logits = result['logits']
                        # logits = logits.argmax(axis=1)  # accuracy() already takes the argmax over raw logits
                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        tmp_eval_accuracy = accuracy(logits, label_ids)
                        eval_loss += tmp_eval_loss.mean().item()
                        eval_accuracy += tmp_eval_accuracy
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = eval_accuracy / nb_eval_examples
                    result = {'dev_eval_loss': eval_loss,
                              'dev_eval_accuracy': eval_accuracy,
                              'global_step': global_step,
                              'loss': tr_loss / nb_tr_steps}
                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a+") as writer:
                        logger.info("***** Dev results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                    model.train()  # switch back to training mode after the in-training evaluation

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "pytorch_model.bin")))

    if args.do_eval and args.local_rank == -1:
        test_dir = os.path.join(args.data_dir, 'test')
        test_high = [test_dir + '/high']
        test_middle = [test_dir + '/middle']

        ## test high
        eval_examples = read_race_examples(test_high)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation: test high *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        # model.eval()
        high_eval_loss, high_eval_accuracy = 0, 0
        high_nb_eval_steps, high_nb_eval_examples = 0, 0
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            with torch.no_grad():
                if args.bert_type == 0:
                    tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                    logits = model(input_ids, segment_ids, input_mask)
                elif args.bert_type == 1:
                    result = model(input_ids=input_ids,
                                   attention_mask=input_mask,
                                   labels=label_ids)
                    tmp_eval_loss = result['loss']
                    logits = result['logits']
            # logits = logits.argmax(axis=1)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            high_eval_loss += tmp_eval_loss.mean().item()
            high_eval_accuracy += tmp_eval_accuracy
            high_nb_eval_examples += input_ids.size(0)
            high_nb_eval_steps += 1

        eval_loss = high_eval_loss / high_nb_eval_steps
        eval_accuracy = high_eval_accuracy / high_nb_eval_examples
        result = {'high_eval_loss': eval_loss,
                  'high_eval_accuracy': eval_accuracy}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a+") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        ## test middle
        eval_examples = read_race_examples(test_middle)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation: test middle *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
        model.eval()
        middle_eval_loss, middle_eval_accuracy = 0, 0
        middle_nb_eval_steps, middle_nb_eval_examples = 0, 0
        for step, batch in enumerate(eval_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            with torch.no_grad():
                if args.bert_type == 0:
                    results = model(input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                elif args.bert_type == 1:
                    results = model(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids)
                tmp_eval_loss = results['loss']
                logits = results['logits']
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            middle_eval_loss += tmp_eval_loss.mean().item()
            middle_eval_accuracy += tmp_eval_accuracy
            middle_nb_eval_examples += input_ids.size(0)
            middle_nb_eval_steps += 1

        eval_loss = middle_eval_loss / middle_nb_eval_steps
        eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples
        result = {'middle_eval_loss': eval_loss,
                  'middle_eval_accuracy': eval_accuracy}
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a+") as writer:
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # all test
        eval_loss = (middle_eval_loss + high_eval_loss) / (middle_nb_eval_steps + high_nb_eval_steps)
        eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / (middle_nb_eval_examples + high_nb_eval_examples)
        result = {'overall_eval_loss': eval_loss,
                  'overall_eval_accuracy': eval_accuracy}
        with open(output_eval_file, "a+") as writer:
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
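# The script above calls several helpers that are not shown in this section
# (select_field, accuracy, warmup_linear). Minimal sketches consistent with how
# they are called here, modeled on the classic pytorch_pretrained_bert RACE
# example -- plausible reconstructions, not necessarily the authors' exact code:
import numpy as np

def select_field(features, field):
    # For each example, gather one value per answer choice, yielding a
    # (num_examples, num_choices, ...) nested list ready for torch.tensor().
    return [[choice[field] for choice in feature.choices_features] for feature in features]

def accuracy(out, labels):
    # `out` holds raw logits of shape (batch, num_choices); returns the number
    # of correct predictions in the batch (a count, not a ratio), which is why
    # the callers divide the running sum by nb_eval_examples.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def warmup_linear(x, warmup=0.002):
    # Legacy linear warmup schedule: ramp up over the first `warmup` fraction
    # of training, then decay linearly; x is the fraction of training done.
    if x < warmup:
        return x / warmup
    return 1.0 - x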
# Standalone MCTest fine-tuning snippet; assumes the hyperparameter constants
# (train_dir, train_batch_size, max_seq_length, learning_rate, num_train_epochs,
# gradient_accumulation_steps) from the commented block at the top of main().
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_examples = read_mctest_examples([train_dir + '.tsv', train_dir + '.ans'])
num_train_steps = int(
    len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_features = convert_examples_to_features(
    train_examples, tokenizer, max_seq_length, True)
train_data = MCTestDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-uncased')
model.to(device)
model = torch.nn.DataParallel(model)
optimizer = AdamW(model.parameters(), lr=learning_rate)

global_step = 0
count = 0
model.train()
for epoch in range(int(num_train_epochs)):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    print("Training Epoch: {}/{}".format(epoch + 1, int(num_train_epochs)))
    for step, batch in enumerate(train_loader):
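# MCTestDataset is referenced above but not defined in this snippet. A minimal
# sketch of a Dataset that would work with a loop like the RACE one, assuming
# the MCTest features carry the same per-choice fields as the RACE features
# (hypothetical reconstruction, not the authors' code):
import torch
from torch.utils.data import Dataset

class MCTestDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        f = self.features[idx]
        # One row per answer choice, mirroring select_field() in the RACE script.
        input_ids = torch.tensor([c['input_ids'] for c in f.choices_features], dtype=torch.long)
        input_mask = torch.tensor([c['input_mask'] for c in f.choices_features], dtype=torch.long)
        label = torch.tensor(f.label, dtype=torch.long)
        return input_ids, input_mask, label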