params += list(model_tag.parameters())
if opt.task_sc:
    params += list(model_class.parameters())
params = list(filter(lambda p: p.requires_grad, params))

named_params = []
named_params += list(model_tag.named_parameters())
if opt.task_sc:
    named_params += list(model_class.named_parameters())
named_params = list(filter(lambda p: p[1].requires_grad, named_params))

# exclude biases and LayerNorm parameters from weight decay
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = len(train_feats['data']) // opt.batchSize * opt.max_epoch
optimizer = AdamW(optimizer_grouped_parameters, lr=opt.lr,
                  correct_bias=False)  # to reproduce BertAdam-specific behavior, set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps),
                                 t_total=num_train_optimization_steps)  # PyTorch scheduler


# prepare_inputs_for_bert(sentences, word_lengths)
def decode(data_feats, data_tags, data_class, output_path):
    data_index = np.arange(len(data_feats))
    losses = []
    TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0
    TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0
    with open(output_path, 'w') as f:
        for j in range(0, len(data_index), opt.test_batchSize):
            if opt.testing:
                words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class(
                    data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize,
                    add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing,
                    enc_dec_focus=opt.enc_dec, device=opt.device)
            else:
                words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class(
                    data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize,
                    add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing,
                    enc_dec_focus=opt.enc_dec, device=opt.device)
params.train_size = train_data['size']
params.val_size = val_data['size']

logging.info("Loading BERT model...")

# Prepare model
model = BertForSequenceTagging.from_pretrained(bert_class, num_labels=len(params.tag2idx))
model.to(params.device)

# Prepare optimizer
if params.full_finetuning:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': params.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:  # only fine-tune the head classifier
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, correct_bias=False)
train_steps_per_epoch = params.train_size // params.batch_size
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=train_steps_per_epoch,
                                            num_training_steps=params.epoch_num * train_steps_per_epoch)

# Train and evaluate the model
logging.info("Starting training for {} epoch(s)".format(params.epoch_num))
train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, tagger_model_dir, args.restore_dir)
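# For reference, a minimal sketch (not part of the original scripts) of the learning-rate
# multiplier that get_linear_schedule_with_warmup applies on top of AdamW: linear warmup
# from 0 to 1 over num_warmup_steps, then linear decay to 0 at num_training_steps.
# The helper name below is illustrative only.
def linear_warmup_lr_multiplier(current_step, num_warmup_steps, num_training_steps):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - current_step) /
               float(max(1, num_training_steps - num_warmup_steps)))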
                               lr=opt.lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=opt.l2)
elif opt.optim_choice.lower() == 'bertadam':
    num_train_optimization_steps = (len(train_dataloader.dataset) // opt.batchSize + 1) * opt.max_epoch
    opt.optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, warmup=opt.warmup_proportion,
                             t_total=num_train_optimization_steps)
elif opt.optim_choice.lower() == 'adamw':
    num_train_optimization_steps = (len(train_dataloader.dataset) // opt.batchSize + 1) * opt.max_epoch
    opt.optimizer = AdamW(optimizer_grouped_parameters, lr=opt.lr, correct_bias=False)
    opt.scheduler = get_linear_schedule_with_warmup(
        opt.optimizer,
        num_warmup_steps=int(opt.warmup_proportion * num_train_optimization_steps),
        num_training_steps=num_train_optimization_steps
    )  # PyTorch scheduler

# loss functions
opt.class_loss_function = nn.BCELoss(reduction='sum')
opt.nll_loss_function = nn.NLLLoss(reduction='sum', ignore_index=Constants.PAD)

# training or testing
if opt.testing:
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument('--kshot', type=int, default=5,
                        help="k for k-shot training, i.e. how many training examples are sampled")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=1e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--beta_sampling_times', type=int, default=15,
                        help="how many mixup coefficients to sample from the Beta distribution per batch")
    parser.add_argument('--batch_mix_times', type=int, default=400,
                        help="how many mixup weight vectors to sample per batch during pretraining")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True. "
                             "0 (default value): dynamic loss scaling. "
                             "Positive power of 2: static loss scaling value.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    train_examples = processor.get_RTE_as_train_k_shot(
        '/export/home/Dataset/glue_data/RTE/train.tsv', args.kshot)  # train_pu_half_v1.txt
    dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv')
    test_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt')
    label_list = ["entailment", "not_entailment"]
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), 'test size:', len(test_examples))

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = RobertaForSequenceClassification(num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,      # bool(args.model_type in ['xlnet']): xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,      # 2 if args.model_type in ['xlnet'] else 0
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,           # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)      # 4 if args.model_type in ['xlnet'] else 0

        '''load dev set'''
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
        dev_all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask, dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size)

        '''load test set'''
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,      # bool(args.model_type in ['xlnet']): xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,      # 2 if args.model_type in ['xlnet'] else 0
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,        # roberta uses an extra separator between sentence pairs, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,           # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)      # 4 if args.model_type in ['xlnet'] else 0

        eval_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        eval_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        test_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                real_batch_size = input_ids.shape[0]

                '''use mixup???'''
                if epoch_i < 20:
                    '''pretraining'''
                    use_mixup = 'pretrain'
                    lambda_vec = torch.rand(args.batch_mix_times, real_batch_size).to(device)
                    lambda_matrix = nn.Softmax(dim=1)(lambda_vec)  # (mix_time, batch_size)
                    logits = model(input_ids, input_mask, None, lambda_matrix, None, is_train=use_mixup)
                    loss_fct = CrossEntropyLoss(reduction='none')
                    mixup_logits = logits.view(-1, num_labels)  # (mixup_times, 2)
                    # equivalent to torch.repeat_interleave(mixup_logits, repeats=real_batch_size, dim=0),
                    # giving shape (mixup_times*batch_size, 2)
                    mixup_logits_repeat = tile(mixup_logits, 0, real_batch_size)
                    label_id_repeat = label_ids.view(-1).repeat(args.batch_mix_times)  # (0, 1, ..., batch, 0, 1, ..., batch)
                    mixup_loss_repeat = loss_fct(mixup_logits_repeat.view(-1, num_labels), label_id_repeat.view(-1))
                    mixup_loss = torch.sum(
                        mixup_loss_repeat.view(args.batch_mix_times, real_batch_size) * lambda_matrix,
                        dim=1)  # (mixup_time,)
                    loss = mixup_loss.mean()
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                else:
                    '''fine-tuning'''
                    use_mixup = 'finetune'
                    for _ in range(args.beta_sampling_times):
                        lambda_vec = beta.rvs(0.4, 0.4, size=1)[0]
                        logits = model(input_ids, input_mask, label_ids, None, lambda_vec, is_train=use_mixup)
                        loss = logits
                        loss.backward()
                        optimizer.step()
                        optimizer.zero_grad()

            '''
            start evaluate on dev set after this epoch
            '''
            model.eval()
            for idd, dev_or_test_dataloader in enumerate([dev_dataloader, test_dataloader]):
                if idd == 0:
                    logger.info("***** Running dev *****")
                    logger.info("  Num examples = %d", len(dev_examples))
                else:
                    logger.info("***** Running test *****")
                    logger.info("  Num examples = %d", len(test_examples))
                # logger.info("  Batch size = %d", args.eval_batch_size)

                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                gold_label_ids = []
                # print('Evaluating...')
                for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    gold_label_ids += list(label_ids.detach().cpu().numpy())
                    with torch.no_grad():
                        logits = model(input_ids, input_mask, None, None, None, is_train='test')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                preds = preds[0]
                pred_probs = softmax(preds, axis=1)
                pred_label_ids = list(np.argmax(pred_probs, axis=1))
                assert len(pred_label_ids) == len(gold_label_ids)
                hit_co = 0
                for k in range(len(pred_label_ids)):
                    if pred_label_ids[k] == gold_label_ids[k]:
                        hit_co += 1
                test_acc = hit_co / len(gold_label_ids)

                if idd == 0:  # this is dev
                    if test_acc > max_dev_acc:
                        max_dev_acc = test_acc
                        print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n')
                    else:
                        print('\ndev acc:', test_acc, ' max_dev_acc:', max_dev_acc, '\n')
                        break
                else:  # this is test
                    if test_acc > max_test_acc:
                        max_test_acc = test_acc
                    final_test_performance = test_acc
                    print('\ntest acc:', test_acc, ' max_test_acc:', max_test_acc, '\n')

        print('final_test_performance:', final_test_performance)
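# The training loop above calls a tile() helper that is not defined in this excerpt; per the
# inline comment it behaves like torch.repeat_interleave. A minimal sketch under that assumption
# (the name and signature are taken from the call site tile(x, dim, n_tile)):
import torch

def tile(x, dim, n_tile):
    # Repeat each slice along `dim` n_tile times, e.g. rows (a, b) -> (a, a, b, b) for n_tile=2.
    return torch.repeat_interleave(x, repeats=n_tile, dim=dim)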