class OneCycleOptimizer(BertAdam):
    """BertAdam variant whose reported learning rate cycles between `start`
    and `stop` (a triangular one-cycle policy) over `steps` calls."""

    def __init__(self, params, start, stop, warmup=-1, t_total=-1,
                 schedule='warmup_linear', b1=0.9, b2=0.999, e=1e-6,
                 weight_decay_rate=0.01, steps=200, max_grad_norm=1.0):
        defaults = dict(lr=start, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
                        max_grad_norm=max_grad_norm)
        # note: `params` is handed to two optimizers below, so it must be a
        # list, not a one-shot generator
        self.bertadam = BertAdam(params, **defaults)
        self.dir = 1          # +1 while ramping up, -1 while ramping down
        self.start = start
        self.stop = stop
        self.curr = start
        self.incr = 2 * (stop - start) / steps  # one full up-down cycle takes `steps` calls
        super(OneCycleOptimizer, self).__init__(params, **defaults)

    def get_lr(self):
        lr_list = self.bertadam.get_lr()
        if self.dir == 1:
            self.curr += self.incr
            if self.curr > self.stop:
                self.dir = -1
        elif self.dir == -1:
            self.curr -= self.incr
            if self.curr < self.start:
                self.dir = 1
        # lr_list is a Python list, so it cannot be divided by a scalar
        # (the original `lr_list / self.curr` raises a TypeError); report
        # the cyclical rate for every parameter group instead.
        return [self.curr for _ in lr_list]
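# A minimal usage sketch of the class above. The model, the lr bounds and the
# step counts here are illustrative assumptions, not taken from the original
# snippet; `params` is materialized as a list because it is consumed twice.
import torch

model = torch.nn.Linear(10, 2)
params = list(model.parameters())
optimizer = OneCycleOptimizer(params, start=1e-5, stop=5e-5, t_total=1000, steps=200)
# Each call to get_lr() advances the triangular cycle by one step:
cycle = [optimizer.get_lr()[0] for _ in range(300)]
print(min(cycle), max(cycle))  # oscillates between roughly `start` and `stop`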
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--examples_n_features_dir", default='data/examples_n_features/', type=str,
                        help="Dir containing DROP examples and features.")
    parser.add_argument("--mlm_dir", default='../data/MLM_train/', type=Path,
                        help="The data dir for the MLM task. Should contain the .jsonl files "
                             "(or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, "
                             "bert-base-chinese.")
    parser.add_argument("--output_dir", default='./out_drop_finetune', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--model", default="bert-base-uncased", type=str)
    parser.add_argument("--init_weights_dir", default='', type=str,
                        help="The directory where initial model weights and config are stored.")
    parser.add_argument("--max_seq_length", default=-1, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization.\n"
                             "Sequences longer than this will be truncated, and sequences shorter\n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_inference", action='store_true',
                        help="Whether to run inference on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--mlm_batch_size", default=-1, type=int, help="Total batch size for MLM train data.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_samples", default=-1, type=int,
                        help="Total number of training samples used.")
    parser.add_argument("--num_train_epochs", default=6.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--freeze_encoder', action='store_true',
                        help="Whether to freeze the bert encoder, embeddings.")
    # parser.add_argument('--indiv_digits', action='store_true',
    #                     help="Whether to tokenize numbers as digits.")
    parser.add_argument('--rand_init', action='store_true',
                        help="Whether to use random init instead of BERT.")
    parser.add_argument('--random_shift', action='store_true',
                        help="Whether to randomly shift position ids of encoder input.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--mlm_scale', type=float, default=1.0, help="MLM loss scaling factor.")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if args.do_train:
        make_output_dir(args, scripts_to_save=[sys.argv[0], 'modeling.py',
                                               'create_examples_n_features.py'])

    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    if args.init_weights_dir:
        model = BertTransformer.from_pretrained(args.init_weights_dir)
    else:
        # prepare model
        model = BertTransformer.from_pretrained(
            args.model,
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                   'distributed_{}'.format(args.local_rank)))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    eval_data = DropDataset(args, 'eval')
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_data))
    logger.info("  Batch size = %d", args.eval_batch_size)

    if args.do_eval and args.do_inference:
        inference(args, model, eval_dataloader, device, tokenizer)
        exit()

    if args.do_train:
        # Prepare data loader
        train_data = DropDataset(args, 'train')
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = (len(train_dataloader) // args.gradient_accumulation_steps
                                        * args.num_train_epochs)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove the pooler, which is not used and would otherwise
        # produce None grads that break apex
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        '''
        ------------------------------------------------------------------------------
        TODO: check training resume, fp16, use --random_shift for short inputs
        ------------------------------------------------------------------------------
        '''
        # using fp16
        fp16 = False
        # try:
        #     from apex import amp
        #     model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        #     fp16 = True
        # except ImportError:
        #     logger.info("Not using 16-bit training due to apex import error.")
        # if n_gpu > 1:
        #     model = torch.nn.DataParallel(model)

        tb_writer = SummaryWriter(os.path.join(args.output_dir, 'log'))  # tensorboard

        # masked LM data
        do_mlm_task = False
        if args.mlm_batch_size > 0:
            mlm_dataset = MLMDataset(training_path=args.mlm_dir, tokenizer=tokenizer)
            mlm_dataloader = DataLoader(mlm_dataset, sampler=RandomSampler(mlm_dataset),
                                        batch_size=args.mlm_batch_size)
            mlm_iter = iter(mlm_dataloader)
            do_mlm_task = True

        model.train()
        (global_step, all_losses, all_errors, all_dec_losses, all_dec_errors, eval_errors,
         best, best_mlm, t_prev, do_eval) = 0, [], [], [], [], [], 1000, 1000, time.time(), False
        mlm_losses, mlm_errors, all_span_losses, all_span_errors = [], [], [], []

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # grads wrt the train data
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, head_type, q_spans, p_spans = batch
                losses = model(input_ids, segment_ids, input_mask,
                               random_shift=args.random_shift, target_ids=label_ids,
                               target_mask=None, answer_as_question_spans=q_spans,
                               answer_as_passage_spans=p_spans, head_type=head_type)
                (loss, errs, dec_loss, dec_errors, span_loss, span_errors,
                 type_loss, type_errors, type_preds) = losses

                # aggregate on multi-gpu
                take_mean = lambda x: x.mean() if x is not None and sum(x.size()) > 1 else x
                take_sum = lambda x: x.sum() if x is not None and sum(x.size()) > 1 else x
                [loss, dec_loss, span_loss, type_loss] = list(
                    map(take_mean, [loss, dec_loss, span_loss, type_loss]))
                [errs, dec_errors, span_errors, type_errors] = list(
                    map(take_sum, [errs, dec_errors, span_errors, type_errors]))

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                all_losses.append(loss.item())
                all_dec_losses.append(dec_loss.item())
                all_errors.append(errs.item() / input_ids.size(0))
                all_dec_errors.append(dec_errors.item() / input_ids.size(0))
                all_span_losses.append(span_loss.item())
                all_span_errors.append(span_errors.item() / input_ids.size(0))
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if do_mlm_task:
                    # grads wrt the mlm data
                    while True:
                        try:
                            batch = next(mlm_iter)  # sample next mlm batch
                            break
                        except StopIteration:  # end of epoch: reset and shuffle
                            mlm_iter = iter(mlm_dataloader)
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, label_ids = batch
                    loss, errs = model(input_ids, None, input_mask, target_ids=label_ids,
                                       ignore_idx=IGNORE_IDX, task='mlm')
                    loss, err_sum = take_mean(loss), take_sum(errs)  # for multi-gpu
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss = args.mlm_scale * loss
                    mlm_losses.append(loss.item())
                    mlm_errors.append(err_sum.item() / input_ids.size(0))
                    if fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                else:
                    mlm_losses.append(-1)
                    mlm_errors.append(-1)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # update step
                    optimizer.zero_grad()
                    global_step += 1

                train_result = {'trn_loss': all_losses[-1], 'trn_dec_loss': all_dec_losses[-1],
                                'trn_err': all_errors[-1], 'trn_dec_err': all_dec_errors[-1],
                                'lr': optimizer.get_lr()[0],
                                'trn_span_loss': all_span_losses[-1],
                                'trn_span_err': all_span_errors[-1],
                                'epoch': epoch}
                mlm_result = {'trn_mlm_loss': mlm_losses[-1], 'trn_mlm_err': mlm_errors[-1]}
                tb_writer.add_scalars('train', train_result, len(all_losses))
                if do_mlm_task:
                    tb_writer.add_scalars('mlm', mlm_result, len(all_losses))

                if time.time() - t_prev > 60 * 60:  # evaluate every hour
                    do_eval = True
                    t_prev = time.time()

                if do_eval:
                    do_eval = False
                    eval_result = evaluate(args, model, eval_dataloader, device, len(train_data))
                    eval_err = eval_result['eval_err']
                    # if eval err is within range of the best, look at the MLM err
                    if eval_err < best or (eval_err < best + 0.005
                                           and np.mean(mlm_errors[-1000:]) < best_mlm):
                        train_state = {'global_step': global_step,
                                       'optimizer_state_dict': optimizer.state_dict()}
                        train_state.update(train_result)
                        save(args, model, tokenizer, train_state)
                        best_mlm = min(best_mlm, np.mean(mlm_errors[-1000:]))
                    best = min(best, eval_err)
                    eval_errors.append((len(all_losses), eval_err))
                    model.train()
                    tb_writer.add_scalars('eval', eval_result, len(all_losses))
                    # for name, param in model.named_parameters():
                    #     tb_writer.add_histogram(name, param.clone().cpu().data.numpy(), len(all_losses))

            # end of epoch
            do_eval = True

        # training complete
        tb_writer.export_scalars_to_json(os.path.join(args.output_dir, 'training_scalars.json'))
        tb_writer.close()
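# A minimal, self-contained sketch of the gradient-accumulation pattern used in
# the loop above: the loss is divided by the number of accumulation steps so
# the summed gradients match one large-batch update, and the weights only
# change every N micro-batches. The tiny model and data are illustrative
# assumptions, not part of the original script.
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_steps = 4
for step in range(8):
    x, y = torch.randn(2, 10), torch.randint(0, 2, (2,))
    loss = torch.nn.functional.cross_entropy(model(x), y)
    (loss / accum_steps).backward()   # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()              # one update per effective batch of 2 * 4 samples
        optimizer.zero_grad()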
def do_training(train_fs, train_exs):
    """Runs BERT fine-tuning."""
    # Allow writing to the enclosing function's global_step.
    nonlocal global_step

    # Create the batched training data out of the features.
    train_data = create_tensor_dataset(train_fs)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Calculate the number of optimization steps.
    num_train_optimization_steps = (len(train_dataloader) // args.gradient_accumulation_steps
                                    * args.num_train_epochs)

    # Prepare optimizer.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    # Log some information about the training.
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_exs))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    # Set the model to training mode and train for X epochs.
    model.train()
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        # Iterate over all batches.
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            # Get the logits and calculate the loss.
            logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
            loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))

            # Scale the loss in gradient accumulation mode.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # Calculate the gradients.
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            # Update the weights every gradient_accumulation_steps steps.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', loss.item(), global_step)
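# A minimal, self-contained sketch of the loss computation above: flattening
# [batch, num_labels] logits against [batch] labels with CrossEntropyLoss.
# The shapes and label count here are illustrative assumptions.
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(8, num_labels)             # [batch_size, num_labels]
label_ids = torch.randint(0, num_labels, (8,))  # [batch_size]
loss = CrossEntropyLoss()(logits.view(-1, num_labels), label_ids.view(-1))
print(loss.item())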
class BertTrainer:
    def __init__(self, hypers: Hypers, model_name, checkpoint, **extra_model_args):
        """
        Initialize the BertOptimizer, with common logic for setting weight_decay_rate,
        doing gradient accumulation and tracking loss.

        :param hypers: the core hyperparameters for the bert model
        :param model_name: the fully qualified name of the bert model we will train,
            like pytorch_pretrained_bert.modeling.BertForQuestionAnswering
        :param checkpoint: if resuming training, this is the checkpoint that contains
            the optimizer state as checkpoint['optimizer']
        """
        self.init_time = time.time()
        self.model = self.get_model(hypers, model_name, checkpoint, **extra_model_args)
        self.step = 0
        self.hypers = hypers
        self.train_stats = TrainStats(hypers)
        self.model.train()
        logger.info('configured model for training')
        # show parameter names
        # logger.info(str([n for (n, p) in self.model.named_parameters()]))

        # Prepare optimizer
        if hasattr(hypers, 'exclude_pooler') and hypers.exclude_pooler:
            # module.bert.pooler.dense.weight, module.bert.pooler.dense.bias
            # see https://github.com/NVIDIA/apex/issues/131
            self.param_optimizer = [(n, p) for (n, p) in self.model.named_parameters()
                                    if '.pooler.' not in n]
        else:
            self.param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in self.param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        self.t_total = hypers.num_train_steps
        self.global_step = hypers.global_step

        if hypers.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex "
                    "to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=hypers.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if hypers.loss_scale == 0:
                self.optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
                                                verbose=(hypers.global_rank == 0))
            else:
                self.optimizer = FP16_Optimizer(optimizer, static_loss_scale=hypers.loss_scale,
                                                verbose=(hypers.global_rank == 0))
        else:
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=hypers.learning_rate,
                                      warmup=hypers.warmup_proportion,
                                      t_total=self.t_total)
        logger.info('created optimizer')

        if checkpoint and type(checkpoint) is dict and 'optimizer' in checkpoint:
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            if not hypers.fp16:
                # if we load this state, we need to set the t_total to what we passed,
                # not what was saved
                self.optimizer.set_t_total(self.t_total)
                # show state of optimizer
                lrs = self.optimizer.get_lr()
                logger.info('Min and max learn rate: %s', str([min(lrs), max(lrs)]))
                logger.info('Min and max step in state: %s', str(self.optimizer.get_steps()))
            instances_per_step = (hypers.train_batch_size *
                                  hypers.gradient_accumulation_steps *
                                  hypers.world_size)
            if 'seen_instances' in checkpoint:
                self.global_step = int(checkpoint['seen_instances'] / instances_per_step)
                self.train_stats.previous_instances = checkpoint['seen_instances']
                logger.info('got global step from checkpoint = %i', self.global_step)
            logger.info('Loaded optimizer state:')
            logger.info(repr(self.optimizer))

    def reset(self):
        """
        Reset any gradient accumulation.
        :return:
        """
        self.model.zero_grad()
        self.step = 0

    def should_continue(self):
        """
        :return: True if training should continue
        """
        if self.global_step >= self.t_total:
            logger.info('stopping due to train step %i >= target train steps %i',
                        self.global_step, self.t_total)
            return False
        if 0 < self.hypers.time_limit <= (time.time() - self.init_time):
            logger.info('stopping due to time out %i seconds', self.hypers.time_limit)
            return False
        return True

    def save_simple(self, filename):
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        # Only save the model itself
        torch.save(model_to_save.state_dict(), filename)
        logger.info(f'saved model only to {filename}')

    def save(self, filename, **extra_checkpoint_info):
        """
        Save a checkpoint with the model parameters, the optimizer state
        and any additional checkpoint info.

        :param filename:
        :param extra_checkpoint_info:
        :return:
        """
        # only local_rank 0, in fact only global rank 0
        if self.hypers.global_rank != 0:
            logger.info('skipping save in %i', torch.distributed.get_rank())
            return
        start_time = time.time()
        checkpoint = extra_checkpoint_info
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        # Only save the model itself
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # also save the optimizer state, since we will likely resume from partial pre-training
        checkpoint['state_dict'] = model_to_save.state_dict()
        checkpoint['optimizer'] = self.optimizer.state_dict()
        # include world size in the instances_per_step calculation
        instances_per_step = (self.hypers.train_batch_size *
                              self.hypers.gradient_accumulation_steps *
                              self.hypers.world_size)
        checkpoint['seen_instances'] = self.global_step * instances_per_step
        checkpoint['num_instances'] = self.t_total * instances_per_step
        # CONSIDER: also save hypers?
        torch.save(checkpoint, filename)
        logger.info(f'saved model to {filename} in {time.time() - start_time} seconds')

    def get_instance_count(self):
        instances_per_step = (self.hypers.train_batch_size *
                              self.hypers.gradient_accumulation_steps *
                              self.hypers.world_size)
        return self.global_step * instances_per_step

    def step_loss(self, loss):
        """
        Accumulates the gradient, tracks the loss and applies the gradient to the model.

        :param loss: the loss from evaluating the model
        """
        if self.global_step == 0:
            logger.info('first step_loss')
        if self.hypers.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        self.train_stats.note_loss(loss.item())

        if self.hypers.gradient_accumulation_steps > 1:
            loss = loss / self.hypers.gradient_accumulation_steps
        if self.hypers.fp16:
            self.optimizer.backward(loss)
        else:
            loss.backward()

        if (self.step + 1) % self.hypers.gradient_accumulation_steps == 0:
            lr_this_step = self.hypers.learning_rate * warmup_linear(
                self.global_step / self.t_total, self.hypers.warmup_proportion)
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = lr_this_step
            self.optimizer.step()
            self.model.zero_grad()
            self.global_step += 1
        self.step += 1

    @classmethod
    def get_files(cls, train_file, completed_files):
        logger.info('completed files = %s, count = %i',
                    str(completed_files[:min(5, len(completed_files))]), len(completed_files))
        # multiple train files
        if not os.path.isdir(train_file):
            train_files = [train_file]
        else:
            if not train_file.endswith('/'):
                train_file = train_file + '/'
            train_files = glob.glob(train_file + '**', recursive=True)
            train_files = [f for f in train_files if not os.path.isdir(f)]
        # exclude completed files
        if not set(train_files) == set(completed_files):
            train_files = [f for f in train_files if f not in completed_files]
        else:
            completed_files = []  # new epoch
        logger.info('train files = %s, count = %i',
                    str(train_files[:min(5, len(train_files))]), len(train_files))
        return train_files, completed_files

    @classmethod
    def get_model(cls, hypers, model_name, checkpoint, **extra_model_args):
        override_state_dict = None
        if checkpoint:
            if type(checkpoint) is dict and 'state_dict' in checkpoint:
                logger.info('loading from multi-part checkpoint')
                override_state_dict = checkpoint['state_dict']
            else:
                logger.info('loading from saved model parameters')
                override_state_dict = checkpoint
        # create the model object by name
        # https://stackoverflow.com/questions/4821104/python-dynamic-instantiation-from-string-name-of-a-class-in-dynamically-imported
        import importlib
        clsdot = model_name.rfind('.')
        class_ = getattr(importlib.import_module(model_name[0:clsdot]),
                         model_name[clsdot + 1:])
        model_args = {'state_dict': override_state_dict,
                      'cache_dir': PYTORCH_PRETRAINED_BERT_CACHE}
        model_args.update(extra_model_args)
        # logger.info(pprint.pformat(extra_model_args, indent=4))
        model = class_.from_pretrained(hypers.bert_model, **model_args)
        logger.info('built model')

        # configure model for fp16, multi-gpu and/or distributed training
        if hypers.fp16:
            model.half()
            logger.info('model halved')
        logger.info('sending model to %s', str(hypers.device))
        model.to(hypers.device)
        logger.info('sent model to %s', str(hypers.device))

        if hypers.local_rank != -1:
            if not hypers.no_apex:
                try:
                    from apex.parallel import DistributedDataParallel as DDP
                    model = DDP(model)
                except ImportError:
                    raise ImportError("Please install apex")
            else:
                model = torch.nn.parallel.DistributedDataParallel(
                    model, device_ids=[hypers.local_rank], output_device=hypers.local_rank)
            logger.info('using DistributedDataParallel for world size %i',
                        hypers.world_size)
        elif hypers.n_gpu > 1:
            model = torch.nn.DataParallel(model)
        return model

    @classmethod
    def get_base_parser(cls):
        parser = argparse.ArgumentParser()
        # Required parameters
        parser.add_argument("--bert_model", default=None, type=str, required=True,
                            help="Bert pre-trained model selected in the list: bert-base-uncased, "
                                 "bert-large-uncased, bert-base-cased, bert-base-multilingual, "
                                 "bert-base-chinese.")
        # Other parameters
        parser.add_argument("--num_instances", default=-1, type=int,
                            help="Total number of training instances to train over.")
        parser.add_argument("--seen_instances", default=-1, type=int,
                            help="When resuming training, the number of instances we have "
                                 "already trained over.")
        parser.add_argument("--train_batch_size", default=32, type=int,
                            help="Total batch size for training.")
        parser.add_argument("--learning_rate", default=5e-5, type=float,
                            help="The initial learning rate for Adam.")
        parser.add_argument("--warmup_proportion", default=0.1, type=float,
                            help="Proportion of training to perform linear learning rate "
                                 "warmup for. E.g., 0.1 = 10%% of training.")
        parser.add_argument("--no_cuda", default=False, action='store_true',
                            help="Whether not to use CUDA when available")
        parser.add_argument("--no_apex", default=False, action='store_true',
                            help="Whether not to use apex when available")
        parser.add_argument('--seed', type=int, default=42,
                            help="random seed for initialization")
        parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                            help="Number of update steps to accumulate before performing "
                                 "a backward/update pass.")
        parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                            help="Whether to perform optimization and keep the optimizer "
                                 "averages on CPU")
        parser.add_argument('--fp16', default=False, action='store_true',
                            help="Whether to use 16-bit float precision instead of 32-bit")
        parser.add_argument('--loss_scale', type=float, default=0,
                            help="Loss scaling, positive power of 2 values can improve fp16 "
                                 "convergence. Leave at zero to use dynamic loss scaling.")
        return parser
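# A minimal sketch of how a training script might extend the shared parser
# above; `--train_file` is an illustrative assumption, not part of this class.
parser = BertTrainer.get_base_parser()
parser.add_argument('--train_file', type=str, help='training data file or directory')
args = parser.parse_args()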
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, "
                             "bert-base-chinese.")
    parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True)
    parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions "
                             "will be written.")
    parser.add_argument("--predict_dir", default=None, type=str, required=True,
                        help="The output directory where the predictions will be written.")

    # Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride "
                             "to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer "
                             "than this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=2.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is "
                             "needed because the start and end predictions are not conditioned "
                             "on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--view_id', type=int, default=1, help="view id of multi-view co-training(two-view)") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--save_all', default=False, action='store_true') # Base setting parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--max_ctx', type=int, default=2) parser.add_argument('--task_name', type=str, default='race') parser.add_argument('--bert_name', type=str, default='pool-race') parser.add_argument('--reader_name', type=str, default='race') parser.add_argument('--per_eval_step', type=int, default=10000000) # model parameters parser.add_argument('--evidence_lambda', type=float, default=0.8) # Parameters for running labeling model parser.add_argument('--do_label', default=False, action='store_true') parser.add_argument('--sentence_id_file', nargs='*') parser.add_argument('--weight_threshold', type=float, default=0.0) parser.add_argument('--only_correct', default=False, action='store_true') parser.add_argument('--label_threshold', type=float, default=0.0) parser.add_argument('--multi_evidence', default=False, action='store_true') parser.add_argument('--metric', default='accuracy', type=str) parser.add_argument('--num_evidence', default=1, type=int) parser.add_argument('--power_length', default=1., type=float) parser.add_argument('--num_choices', default=4, type=int) parser.add_argument('--split_type', default=0, type=int) parser.add_argument('--use_gumbel', default=False, action='store_true') parser.add_argument('--sample_steps', type=int, default=10) parser.add_argument('--reward_func', type=int, default=0) parser.add_argument('--freeze_bert', default=False, action='store_true') args = parser.parse_args() logger = setting_logger(args.output_dir) logger.info('================== Program start. 
    logger.info(f'================== Running with seed {args.seed} ==========================')

    model_params = prepare_model_params(args)
    read_params = prepare_read_params(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict and not args.do_label:
        raise ValueError("At least one of `do_train`, `do_predict` or `do_label` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if args.do_train:
        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            raise ValueError("Output directory ({}) already exists and is not empty."
                             .format(args.output_dir))
        os.makedirs(args.output_dir, exist_ok=True)
    if args.do_predict or args.do_label:
        os.makedirs(args.predict_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)

    data_reader = initialize_reader(args.reader_name)

    num_train_steps = None
    if args.do_train or args.do_label:
        train_examples = data_reader.read(input_file=args.train_file, **read_params)

        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.max_ctx), str(args.task_name))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except FileNotFoundError:
            train_features = data_reader.convert_examples_to_features(
                examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        num_train_steps = int(len(train_features) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if args.pretrain is not None:
        logger.info('Load pretrained model from {}'.format(args.pretrain))
        model_state_dict = torch.load(args.pretrain, map_location='cuda:0')
        model = initialize_model(args.bert_name, args.model_file,
                                 state_dict=model_state_dict, **model_params)
    else:
        model = initialize_model(args.bert_name, args.model_file, **model_params)

    # if args.fp16:
    #     model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # Remove frozen parameters
    param_optimizer = [n for n in param_optimizer if n[1].requires_grad]
    # hack to remove the pooler, which is not used and would otherwise
    # produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    t_total = num_train_steps if num_train_steps is not None else -1
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()

    # if args.fp16:
    #     try:
    #         from apex.optimizers import FP16_Optimizer
    #         from apex.optimizers import FusedAdam
    #     except ImportError:
    #         raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
    #                           "to use distributed and fp16 training.")
    #     optimizer = FusedAdam(optimizer_grouped_parameters,
    #                           lr=args.learning_rate,
    #                           bias_correction=False,
    #                           max_grad_norm=1.0)
    #     if args.loss_scale == 0:
    #         optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    #     else:
    #         optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    #     warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=t_total)
    #     logger.info(f"warm up linear: warmup = {warmup_linear.warmup}, t_total = {warmup_linear.t_total}.")
    # else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    if args.fp16:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    # Prepare data
    eval_examples = data_reader.read(input_file=args.predict_file, **read_params)
    eval_features = data_reader.convert_examples_to_features(
        examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length)

    eval_tensors = data_reader.data_to_tensors(eval_features)
    eval_data = TensorDataset(*eval_tensors)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    if args.do_train:
        if args.do_label:
            logger.info('Training in state-wise mode.')
            sentence_label_file = args.sentence_id_file
            if sentence_label_file is not None:
                for file in sentence_label_file:
                    train_features = data_reader.generate_features_sentence_ids(train_features, file)
            else:
                logger.info('No sentence id supervision is found.')
        else:
            logger.info('Training in the traditional way.')

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Num train total optimization steps = %d", t_total)
        logger.info("  Batch size = %d", args.train_batch_size)

        train_loss = AverageMeter()
        best_acc = 0.0
        best_loss = 1000000
        summary_writer = SummaryWriter(log_dir=args.output_dir)
        global_step = 0
        eval_loss = AverageMeter()
        eval_accuracy = CategoricalAccuracy()
        eval_epoch = 0

        train_tensors = data_reader.data_to_tensors(train_features)
        train_data = TensorDataset(*train_tensors)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in range(int(args.num_train_epochs)):
            logger.info(f'Running at Epoch {epoch}')

            # Train
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", dynamic_ncols=True)):
                model.train()
                if n_gpu == 1:
                    batch = batch_to_device(batch, device)  # multi-gpu does scattering itself
                inputs = data_reader.generate_inputs(batch, train_features,
                                                     model_state=ModelState.Train)
                model_output = model(**inputs)
                loss = model_output['loss']
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    # optimizer.backward(loss)
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify the learning rate with the special warm up BERT uses;
                    # if args.fp16 is False, BertAdam is used and handles this automatically
                    # if args.fp16:
                    #     lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step)
                    #     for param_group in optimizer.param_groups:
                    #         param_group['lr'] = lr_this_step
                    #     summary_writer.add_scalar('lr', lr_this_step, global_step)
                    # else:
                    summary_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                train_loss.update(loss.item(), 1)
                summary_writer.add_scalar('train_loss', train_loss.avg, global_step)
                # logger.info(f'Train loss: {train_loss.avg}')

                if (step + 1) % args.per_eval_step == 0 or step == len(train_dataloader) - 1:
                    # Evaluation
                    model.eval()
                    logger.info("Start evaluating")
                    for _, eval_batch in enumerate(tqdm(eval_dataloader, desc="Evaluating",
                                                        dynamic_ncols=True)):
                        if n_gpu == 1:
                            eval_batch = batch_to_device(eval_batch, device)  # multi-gpu does scattering itself
                        inputs = data_reader.generate_inputs(eval_batch, eval_features,
                                                             model_state=ModelState.Evaluate)
                        with torch.no_grad():
                            output_dict = model(**inputs)
                            loss, choice_logits = output_dict['loss'], output_dict['choice_logits']
                            eval_loss.update(loss.item(), 1)
                            eval_accuracy(choice_logits, inputs["labels"])

                    eval_epoch_loss = eval_loss.avg
                    summary_writer.add_scalar('eval_loss', eval_epoch_loss, eval_epoch)
                    eval_loss.reset()
                    current_acc = eval_accuracy.get_metric(reset=True)
                    summary_writer.add_scalar('eval_acc', current_acc, eval_epoch)
                    torch.cuda.empty_cache()

                    if args.save_all:
                        # Only save the model itself
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir,
                                                         f"pytorch_model_{eval_epoch}.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                    if current_acc > best_acc:
                        best_acc = current_acc
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                    if eval_epoch_loss < best_loss:
                        best_loss = eval_epoch_loss
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, "pytorch_loss_model.bin")
                        torch.save(model_to_save.state_dict(), output_model_file)
                    logger.info('Eval Epoch: %d, Accuracy: %.4f (Best Accuracy: %.4f)'
                                % (eval_epoch, current_acc, best_acc))
                    eval_epoch += 1

            logger.info(f'Epoch {epoch}: Accuracy: {best_acc}, Train Loss: {train_loss.avg}')

        summary_writer.close()

    for output_model_name in ["pytorch_model.bin", "pytorch_loss_model.bin"]:
        # Loading trained model
        output_model_file = os.path.join(args.output_dir, output_model_name)
        model_state_dict = torch.load(output_model_file, map_location='cuda:0')
        model = initialize_model(args.bert_name, args.model_file,
                                 state_dict=model_state_dict, **model_params)
        model.to(device)

        # Write Yes/No predictions
        if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            test_examples = data_reader.read(args.test_file)
            test_features = data_reader.convert_examples_to_features(test_examples, tokenizer,
                                                                     args.max_seq_length)

            test_tensors = data_reader.data_to_tensors(test_features)
            test_data = TensorDataset(*test_tensors)
            test_sampler = SequentialSampler(test_data)
            test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                         batch_size=args.predict_batch_size)

            logger.info("***** Running predictions *****")
            logger.info("  Num orig examples = %d", len(test_examples))
            logger.info("  Num split examples = %d", len(test_features))
            logger.info("  Batch size = %d", args.predict_batch_size)

            model.eval()
            all_results = []
            test_acc = CategoricalAccuracy()
            logger.info("Start predicting yes/no on the dev set.")
            for batch in tqdm(test_dataloader, desc="Testing"):
                if n_gpu == 1:
                    batch = batch_to_device(batch, device)  # multi-gpu does scattering itself
                inputs = data_reader.generate_inputs(batch, test_features,
                                                     model_state=ModelState.Evaluate)
                with torch.no_grad():
                    batch_choice_logits = model(**inputs)['choice_logits']
                    test_acc(batch_choice_logits, inputs['labels'])
                example_indices = batch[-1]
                for i, example_index in enumerate(example_indices):
                    choice_logits = batch_choice_logits[i].detach().cpu().tolist()
                    test_feature = test_features[example_index.item()]
                    unique_id = int(test_feature.unique_id)
                    all_results.append(RawResultChoice(unique_id=unique_id,
                                                       choice_logits=choice_logits))

            if "loss" in output_model_name:
                logger.info('Predicting question choice on the test set using the model '
                            'with the lowest loss on the validation set.')
                output_prediction_file = os.path.join(args.predict_dir, 'loss_predictions.json')
            else:
                logger.info('Predicting question choice on the test set using the model '
                            'with the best accuracy on the validation set.')
                output_prediction_file = os.path.join(args.predict_dir, 'predictions.json')
            data_reader.write_predictions(test_examples, test_features, all_results,
                                          output_prediction_file)
            logger.info(f"Accuracy on the test set: {test_acc.get_metric(reset=True)}")

    # Loading trained model.
    if args.metric == 'accuracy':
        logger.info("Load the model with the best accuracy on the validation set.")
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    elif args.metric == 'loss':
        logger.info("Load the model with the lowest loss on the validation set.")
        output_model_file = os.path.join(args.output_dir, "pytorch_loss_model.bin")
    else:
        raise RuntimeError(f"Wrong metric type {args.metric!r}; "
                           f"it must be one of ['accuracy', 'loss'].")

    model_state_dict = torch.load(output_model_file, map_location='cuda:0')
    model = initialize_model(args.bert_name, args.model_file,
                             state_dict=model_state_dict, **model_params)
    model.to(device)

    # Labeling sentence ids.
    if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        f = open('debug_log.txt', 'w')

        def softmax(x):
            """Compute softmax values for each set of scores in x."""
            e_x = np.exp(x - np.max(x))
            return e_x / e_x.sum()

        def topk(sentence_sim):
            """
            :param sentence_sim: numpy array of per-sentence similarity scores
            :return: dict with the selected evidence sentence ids and their confidence
            """
            max_length = min(args.num_evidence, len(sentence_sim))
            sorted_scores = np.array(sorted(sentence_sim, reverse=True))
            scores = []
            for idx in range(max_length):
                scores.append(np.log(softmax(sorted_scores[idx:])[0]))
            scores = [np.mean(scores[:(j + 1)]) for j in range(max_length)]
            top_k = int(np.argmax(scores) + 1)
            sorted_scores = sorted(enumerate(sentence_sim), key=lambda x: x[1], reverse=True)
            evidence_ids = [x[0] for x in sorted_scores[:top_k]]
            sentence = {'sentences': evidence_ids,
                        'value': float(np.exp(scores[top_k - 1]))}
            return sentence

        def batch_topk(sentence_sim, sentence_mask):
            batch_size = sentence_sim.size(0)
            num_choices = sentence_sim.size(1)
            sentence_sim = sentence_sim.numpy() + 1e-15
            sentence_mask = sentence_mask.numpy()
            sentence_ids = []
            for b in range(batch_size):
                choice_sentence_ids = [topk(_sim[:int(sum(_mask))])
                                       for _sim, _mask in zip(sentence_sim[b], sentence_mask[b])]
                assert len(choice_sentence_ids) == num_choices
                sentence_ids.append(choice_sentence_ids)
            return sentence_ids

        test_examples = train_examples
        test_features = train_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                     batch_size=args.predict_batch_size)

        logger.info("***** Running labeling *****")
        logger.info("  Num orig examples = %d", len(test_examples))
        logger.info("  Num split examples = %d", len(test_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start labeling.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(batch, device)
            inputs = data_reader.generate_inputs(batch, test_features,
                                                 model_state=ModelState.Test)
            with torch.no_grad():
                output_dict = model(**inputs)
                batch_choice_logits = output_dict["choice_logits"]
                batch_sentence_logits = output_dict["sentence_logits"]
                batch_sentence_mask = output_dict["sentence_mask"]
            example_indices = batch[-1]
            # batch_beam_results = batch_choice_beam_search(batch_sentence_logits, batch_sentence_mask)
            batch_topk_results = batch_topk(batch_sentence_logits, batch_sentence_mask)
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu()
                evidence_list = batch_topk_results[i]
                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)
                all_results.append(RawOutput(unique_id=unique_id,
                                             model_output={"choice_logits": choice_logits,
                                                           "evidence_list": evidence_list}))

        output_prediction_file = os.path.join(args.predict_dir, 'sentence_id_file.json')
        data_reader.predict_sentence_ids(test_examples, test_features, all_results,
                                         output_prediction_file,
                                         weight_threshold=args.weight_threshold,
                                         only_correct=args.only_correct,
                                         label_threshold=args.label_threshold)
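# A tiny standalone demo of the adaptive top-k evidence selection implemented
# by `topk` above: k is chosen to maximize the mean log-softmax confidence of
# the leading scores. The scores and the cap of 3 (standing in for
# args.num_evidence) are illustrative assumptions.
import numpy as np

def _softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

sentence_sim = np.array([0.9, 0.7, 0.1, 0.05])
max_length = min(3, len(sentence_sim))
sorted_scores = np.array(sorted(sentence_sim, reverse=True))
scores = [np.log(_softmax(sorted_scores[idx:])[0]) for idx in range(max_length)]
scores = [np.mean(scores[:j + 1]) for j in range(max_length)]
print('chosen k =', int(np.argmax(scores) + 1))  # how many sentences to keep as evidence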
print('Start training!')
writer = SummaryWriter(dir + 'models/logs/%s/' % LOG_NAME)
loss_fct = torch.nn.CrossEntropyLoss()
model.train()
for epoch in range(current_epoch, EPOCH):
    loss_train = 0
    for idx, batch in enumerate(tqdm(train_loader), 1):
        loss, correct = train(batch, model, loss_fct, optimizer, idx,
                              gradient_accumulation_steps)
        writer.add_scalar('loss_train_batch/loss_train_batch', loss, global_batch_counter_train)
        writer.add_scalar('accuracy_train_batch/accuracy_train_batch',
                          correct / batch[0].shape[0], global_batch_counter_train)
        writer.add_scalar('epoch/lr', optimizer.get_lr()[-1], global_batch_counter_train)
        global_batch_counter_train += 1
        loss_train += loss

        if (test_per_n_batch_one_epoch and (idx % test_per_n_batch_one_epoch == 0)) \
                or (DEBUG and idx == 4 and test_per_n_batch_one_epoch):
            print('Start testing...')
            loss_test = 0
            total = 0
            correct = 0
            y_pred = []
            y_true = []
            model.eval()
            with torch.no_grad():
                for test_idx, batch in enumerate(tqdm(test_loader), 1):
def train(**kwargs):
    config = Config()
    config.update(**kwargs)
    print('Current settings:\n', config)
    if args.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(args.vocab_file)
    label_dic = load_vocab(config.label_file)
    index2label = {v: k for k, v in label_dic.items()}
    tagset_size = len(label_dic)

    train_data, _ = read_corpus(args.pretrain_train_path, max_length=args.max_seq_length,
                                label_dic=label_dic, vocab=vocab)
    dev_data, dev_len = read_corpus(args.pretrain_dev_path, max_length=args.max_seq_length,
                                    label_dic=label_dic, vocab=vocab)

    num_train_optimization_steps = int(len(train_data) / args.train_batch_size /
                                       args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = (num_train_optimization_steps
                                        // torch.distributed.get_world_size())

    train_ids = torch.LongTensor([temp.input_id for temp in train_data])
    train_masks = torch.LongTensor([temp.input_mask for temp in train_data])
    train_tags = torch.LongTensor([temp.label_id for temp in train_data])
    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.train_batch_size)

    dev_ids = torch.LongTensor([temp.input_id for temp in dev_data])
    dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data])
    dev_tags = torch.LongTensor([temp.label_id for temp in dev_data])
    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=args.eval_batch_size)

    model = BERT_LSTM_CRF(args, tagset_size, config.bert_embedding, config.rnn_hidden,
                          config.rnn_layer, dropout_ratio=config.dropout_ratio,
                          dropout1=config.dropout1, use_cuda=config.use_cuda)
    if config.use_cuda:
        model = model.cuda()

    if config.load_model:
        if config.flag == 'submit':
            assert config.load_path is not None
            test_data, test_len = read_corpus(args.submit_test_path,
                                              max_length=args.max_seq_length,
                                              label_dic=label_dic, vocab=vocab, flag='submit')
            test_ids = torch.LongTensor([temp.input_id for temp in test_data])
            test_masks = torch.LongTensor([temp.input_mask for temp in test_data])
            test_dataset = TensorDataset(test_ids, test_masks)
            test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size)
            model = load_model(model, name=None)
            test(model, test_loader, config, index2label, test_len)
            # dev(model, test_loader, None, config)
        if config.flag == 'test':
            assert config.load_path is not None
            test_data, test_len = read_corpus(args.pretrain_test_path,
                                              max_length=args.max_seq_length,
                                              label_dic=label_dic, vocab=vocab)
            test_ids = torch.LongTensor([temp.input_id for temp in test_data])
            test_masks = torch.LongTensor([temp.input_mask for temp in test_data])
            test_tags = torch.LongTensor([temp.label_id for temp in test_data])
            test_dataset = TensorDataset(test_ids, test_masks, test_tags)
            test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size)
            model = load_model(model, name=None)
            # test(model, test_loader, config, index2label, test_len)
            dev(model, test_loader, 0, config, index2label, dev_len)
    else:
        # print(model)
        model.train()
        bert_param_optimizer = list(model.word_embeds.named_parameters())
        lstm_param_optimizer = list(model.lstm.named_parameters())
        liner_param_optimizer = list(model.liner.named_parameters())
        crf_param_optimizer = list(model.crf.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # Per-module parameter groups: each module gets its own weight decay
        # and learning-rate multiplier.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01, 'lr': config.lr},
            {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': config.lr},
            {'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.001, 'lr': config.lr * 5},
            {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': config.lr * 5},
            {'params': [p for n, p in liner_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.001, 'lr': config.lr * 2},
            {'params': [p for n, p in liner_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': config.lr * 2},
            {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.001, 'lr': config.lr * 3},
            {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0, 'lr': config.lr * 3},
        ]
        # print(optimizer_grouped_parameters)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             # lr=config.lr,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
        # optimizer = optimizer(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)

        eval_f1 = 0.0
        for epoch in range(config.base_epoch):
            print(optimizer.get_lr())
            step = 0
            for i, batch in enumerate(train_loader):
                step += 1
                model.zero_grad()
                inputs, masks, tags = batch
                inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags)
                if config.use_cuda:
                    inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda()
                feats = model(inputs, masks)
                loss = model.loss(feats, masks, tags)
                loss.backward()
                optimizer.step()
                if step % 50 == 0:
                    print('step: {} | epoch: {} | loss: {}'.format(step, epoch, loss.item()))
            f_measure = dev(model, dev_loader, epoch, config, index2label, dev_len)
            if eval_f1 < f_measure:
                eval_f1 = f_measure
                save_model(model, epoch, f_measure)
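# A minimal sketch of the per-module learning-rate idea used above, with two
# dummy modules. The module names and multipliers are illustrative
# assumptions, not the values from the training script.
import torch

encoder = torch.nn.Linear(4, 4)
head = torch.nn.Linear(4, 2)
base_lr = 1e-5
groups = [
    {'params': encoder.parameters(), 'lr': base_lr},   # pretrained part: small lr
    {'params': head.parameters(), 'lr': base_lr * 5},  # freshly initialized part: larger lr
]
opt = torch.optim.Adam(groups)
print([g['lr'] for g in opt.param_groups])  # [1e-05, 5e-05]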
def train(params, datasets, student, val_iterator, cuda_device=-1, teachers=None):
    """Train the model."""
    tb_writer = SummaryWriter()
    t_total = params.params.get('trainer').get('optimizer').get("t_total")

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in student.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in student.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # BertAdam already has a scheduler
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                         eps=args.adam_epsilon, t_total=t_total)
    # parameters = [[n, p] for n, p in student.named_parameters() if p.requires_grad]
    # optimizer = Optimizer.from_params(parameters, params.get("trainer").pop("optimizer"))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(list(datasets["train"])))
    logger.info("  Num Epochs = %d", num_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                params.get("iterator").get("batch_size"))
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    total_training_loss = 0.0
    logging_loss = 0.0
    best_f1 = 0.0
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(student.vocab)

    for epoch in range(num_epochs):
        tr_loss = 0.0
        student.zero_grad()
        batches_this_epoch = 0
        epoch_start_time = time.time()
        logger.info("Training")
        logger.info("Epoch %d/%d", epoch, num_epochs - 1)

        # Get tqdm for the training batches
        train_generator = iterator(datasets["train"], num_epochs=1, shuffle=True)
        num_training_batches = math.ceil(iterator.get_num_batches(datasets["train"]))
        train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_training_batches)

        for batch_group in train_generator_tqdm:
            batches_this_epoch += 1
            set_name = batch_group["metadata"][0]["dataset"]
            if teachers != {}:
                assert set_name in teachers.keys()
            teacher = teachers.get(set_name, None)
            student.train()
            if teacher is not None:
                teacher.eval()
            batch_group = nn_util.move_to_device(batch_group, cuda_device)
            output_dict = student(**batch_group)
            start_logits_stu = output_dict["span_start_logits"]
            end_logits_stu = output_dict["span_end_logits"]
            loss = output_dict["loss"]  # pure student loss, gold loss

            # Distillation loss
            if teacher is not None:
                with torch.no_grad():
                    teacher_output_dict = teacher(**batch_group)
                start_logits_tea = teacher_output_dict["span_start_logits"]
                end_logits_tea = teacher_output_dict["span_end_logits"]
                assert start_logits_stu.size() == start_logits_tea.size()
                assert end_logits_stu.size() == end_logits_tea.size()
                bias_weights = get_bias_weight(args.bias_type, batch_group, set_name)
                if args.method == "CR":  # confidence regularization method
                    loss_start = probability_scaling(start_logits_stu, bias_weights[:, 0],
                                                     start_logits_tea)
                    loss_end = probability_scaling(end_logits_stu, bias_weights[:, 1],
                                                   end_logits_tea)
                else:  # the WL method
                    loss_start = loss_reweighting(start_logits_stu, bias_weights[:, 0],
                                                  start_logits_tea)
                    loss_end = loss_reweighting(end_logits_stu, bias_weights[:, 1],
                                                end_logits_tea)
                loss = loss_start + loss_end

            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            student.zero_grad()
            global_step += 1

            metrics = training_util.get_metrics(student, tr_loss, batches_this_epoch)
            description = training_util.description_from_metrics(metrics) + "\n"
            train_generator_tqdm.set_description(description, refresh=False)
        training_util.get_metrics(student, tr_loss, batches_this_epoch, reset=True)

        # Evaluate on the validation dataset
        with torch.no_grad():
            logging.info("validation student")
            metrics = evaluate(student, datasets["validation"], val_iterator,
                               cuda_device, batch_weight_key="")
            current_f1 = metrics["f1"]
            for key, value in metrics.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
            tb_writer.add_scalar("lr", optimizer.get_lr()[0], global_step)
            tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
            logging_loss = tr_loss
            logger.info("{'Epoch %d/%d, student exact_match': %s, 'f1': %s}",
                        epoch, num_epochs - 1, metrics["EM"], metrics["f1"])

        # Save model checkpoint
        model_path = os.path.join(args.output_dir, "model_state_epoch_{}.th".format(epoch))
        best_path = os.path.join(args.output_dir, "best.th")
        torch.save(student.state_dict(), model_path)
        if current_f1 > best_f1:
            torch.save(student.state_dict(), best_path)
            best_f1 = current_f1
        logger.info("Saving model checkpoint to %s", args.output_dir)

        epoch_elapsed_time = time.time() - epoch_start_time
        logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time))
        total_training_loss += tr_loss
        student.get_metrics(reset=True)

    tb_writer.close()
    return global_step, total_training_loss / global_step
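# --- Illustrative sketch (not part of the original script) -------------------
# `loss_reweighting` (the WL method) and `probability_scaling` (CR) are called
# in train() above but defined elsewhere. A common shape for bias-weighted
# distillation is a per-example soft cross-entropy between the student and
# teacher span distributions, scaled by the bias weight; the sketch below is
# an assumption about that shape, not the author's implementation.
import torch.nn.functional as F

def loss_reweighting_sketch(student_logits, bias_weight, teacher_logits):
    # Per-example soft cross-entropy H(teacher, student).
    teacher_probs = F.softmax(teacher_logits, dim=-1)
    student_log_probs = F.log_softmax(student_logits, dim=-1)
    per_example = -(teacher_probs * student_log_probs).sum(dim=-1)
    # Scale each example's loss by its bias weight and average over the batch.
    return (bias_weight * per_example).mean()
# -----------------------------------------------------------------------------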
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--log_path', type=str, default="./log",
                        help="The path for saving tensorboard logs. Default is ./log")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
                format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    processors = {
        "qe": MyProcessor,
    }

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    model_collections = Collections()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    model.to(device)

    # Alternative: resume fine-tuning from a saved checkpoint.
    # output_config_file = os.path.join(args.bert_model, CONFIG_NAME)
    # config = BertConfig(output_config_file)
    # model = BertForSequenceClassification(config)
    #
    # output_model_file = os.path.join(args.bert_model, WEIGHTS_NAME)
    # model_state_dict = torch.load(output_model_file)
    # model.load_state_dict(model_state_dict)
    # model.to(device)
    # -----------------------------

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)
    # ignores_names = ['classifier.weight', 'classifier.bias']
    #
    # base_params = [p for n, p in model.named_parameters() if not any(nd in n for nd in ignores_names)]
    # ignores_params = [p for n, p in model.named_parameters() if any(nd in n for nd in ignores_names)]
    #
    # optimizer = torch.optim.Adam([{'params': base_params},
    #                               {'params': ignores_params, 'lr': args.learning_rate * 10}],
    #                              lr=args.learning_rate)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # Timer for computing speed
        timer_for_speed = Timer()
        timer_for_speed.tic()
        summary_writer = SummaryWriter(log_dir=args.log_path)
        is_early_stop = False
        disp_freq = 100
        loss_valid_freq = 100
        early_stop_patience = 10
        bad_count = 0

        nb_tr_examples, nb_tr_steps = 0, 0
        for eidx in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                try:
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch
                    with torch.enable_grad():
                        loss = model(input_ids, segment_ids, input_mask, label_ids)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                    # model_collections.add_to_collection("train_losses", loss.item())
                    # summary_writer.add_scalar("train_losses", loss.item(), global_step=nb_tr_steps)

                    # display some information
                    if nb_tr_steps % disp_freq == 0:
                        lrate = list(optimizer.get_lr())[0]
                        result = {'train_loss': loss.item(), "lrate": lrate}
                        logger.info("***** train results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                except RuntimeError as e:
                    if 'out of memory' in str(e):
                        print('| WARNING: ran out of memory, skipping batch')
                        # optimizer.zero_grad()
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        raise e

                # calculate dev loss
                if nb_tr_steps % loss_valid_freq == 0:
                    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
                        eval_examples = processor.get_dev_examples(args.data_dir)
                        eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer)
                        logger.info("***** Running evaluation *****")
                        logger.info("  Num examples = %d", len(eval_examples))
                        logger.info("  Batch size = %d", args.eval_batch_size)
                        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
                        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                        # Run prediction for full data
                        eval_sampler = SequentialSampler(eval_data)
                        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                                     batch_size=args.eval_batch_size)
                        model.eval()
                        eval_loss = 0
                        nb_eval_steps, nb_eval_examples = 0, 0
                        for batch_eval in eval_dataloader:
                            batch_eval = tuple(t.to(device) for t in batch_eval)
                            input_ids, input_mask, segment_ids, label_ids = batch_eval
                            with torch.no_grad():
                                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_examples += input_ids.size(0)
                            nb_eval_steps += 1
                        eval_loss = eval_loss / nb_eval_steps
                        model_collections.add_to_collection("history_losses", eval_loss)
                        min_history_loss = np.array(model_collections.get_collection("history_losses")).min()
                        summary_writer.add_scalar("loss", eval_loss, global_step=nb_tr_steps)
                        summary_writer.add_scalar("best_loss", min_history_loss, global_step=nb_tr_steps)
                        lrate = list(optimizer.get_lr())[0]
                        summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=nb_tr_steps)
                        best_eval_loss = min_history_loss
                        # If the model reaches a new best valid loss, save it;
                        # otherwise count toward early stopping.
                        if eval_loss <= best_eval_loss:
                            bad_count = 0
                            if is_early_stop is False:
                                # Save a trained model (only the model itself)
                                model_to_save = model.module if hasattr(model, 'module') else model
                                output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
                                torch.save(model_to_save.state_dict(), output_model_file)
                                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                                with open(output_config_file, 'w') as f:
                                    f.write(model_to_save.config.to_json_string())
                        else:
                            bad_count += 1
                        # At least one epoch should be traversed
                        if bad_count >= early_stop_patience and eidx > 0:
                            is_early_stop = True
                            logger.info("Early Stop!")
                        summary_writer.add_scalar("bad_count", bad_count, nb_tr_steps)
                        logger.info("{0} Loss: {1:.4f} patience: {2}".format(nb_tr_steps, eval_loss, bad_count))
                if is_early_stop:
                    break
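# --- Illustrative sketch (not part of the original script) -------------------
# The loop above hand-rolls patience-based early stopping with bad_count /
# early_stop_patience. The same bookkeeping can be factored into a small
# helper; this is a generic sketch, not the author's implementation.
class EarlyStopping(object):
    def __init__(self, patience=10):
        self.patience = patience
        self.best = float('inf')
        self.bad_count = 0

    def step(self, loss):
        """Record a validation loss; return True when training should stop."""
        if loss <= self.best:
            self.best = loss
            self.bad_count = 0
        else:
            self.bad_count += 1
        return self.bad_count >= self.patience
# -----------------------------------------------------------------------------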
def main(args):
    logging = config.get_logging(args.log_name)
    logging.info("##" * 20)
    logging.info("##" * 20)
    logging.info("##" * 20)
    logging.info(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logging.info("| question first :: {}".format(args.question_first))
    logging.info("| gpu count : {}".format(n_gpu))
    logging.info("| train batch size in each gpu : {}".format(args.train_batch_size))
    logging.info("| build tokenizer and model in : {}".format(args.pre_dir))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    tokenizer = BertTokenizer.build_tokenizer(args)
    train_data_iter = MSmarco_iterator(args, tokenizer,
                                       batch_size=args.train_batch_size,
                                       world_size=n_gpu,
                                       accumulation_steps=args.gradient_accumulation_steps,
                                       name="msmarco_train.pk")
    dev_data_iter = MSmarco_iterator(args, tokenizer,
                                     batch_size=args.valid_batch_size,
                                     world_size=n_gpu,
                                     name="msmarco_dev.pk")
    data_size = len(train_data_iter)
    gradient_accumulation_steps = args.gradient_accumulation_steps
    num_train_steps = args.num_train_epochs * data_size // gradient_accumulation_steps
    # logging.info("| load dataset {}".format(data_size))
    logging.info("| train data size {}".format(len(train_data_iter) * n_gpu * args.train_batch_size))
    logging.info("| dev data size {}".format(len(dev_data_iter) * n_gpu * args.valid_batch_size))
    logging.info("| train batch data size {}".format(len(train_data_iter)))
    logging.info("| dev batch data size {}".format(len(dev_data_iter)))
    logging.info("| update in each train data {}".format(data_size // gradient_accumulation_steps))
    logging.info("| total update {}".format(num_train_steps))
    # num_train_steps = (96032//2//2) + (data_size-96032)//2

    model = MSmarco.build_model(args)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'layer_norm']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.lr,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
    logging.info("| init lr is {}".format(optimizer.get_lr()))

    global_update = 0
    for epochs in range(args.num_train_epochs):
        total_loss = 0
        merge_batch = []
        # count = 0
        for step, batch in enumerate(tqdm(train_data_iter, desc="Train Iteration")):
            model.train()
            # if step < 96032:
            #     merge_batch.append(batch)
            #     if len(merge_batch) == 2:
            #         batch = merger_tensor(merge_batch)
            #         merge_batch = []
            #     else:
            #         continue
            if n_gpu == 1:
                for key in batch.keys():
                    batch[key] = batch[key].to(device)
            loss = model(**batch)
            # count += 1
            # pdb.set_trace()
            if n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_update += 1
                if global_update % args.validate_updates == 0:
                    validation(args, model, dev_data_iter, n_gpu, epochs, global_update, logging)
            if (step + 1) % args.loss_interval == 0:
                logging.info("TRAIN :: Epoch {} updates {}, train loss {}".format(epochs, global_update, loss.item()))
        # save_checkpoint(args, model, epochs)
        validation(args, model, dev_data_iter, n_gpu, epochs, global_update, logging)
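# --- Illustrative sketch (not part of the original script) -------------------
# BertAdam's warmup/t_total arguments implement the warmup_linear schedule:
# the learning rate ramps up linearly over the first `warmup` fraction of
# training, then decays linearly toward zero. Reproduced here as a reference
# sketch of pytorch-pretrained-bert's behavior, not a verbatim copy.
def warmup_linear_sketch(progress, warmup=0.1):
    """progress = global_step / t_total, in [0, 1]."""
    if progress < warmup:
        return progress / warmup
    return max(0.0, 1.0 - progress)
# e.g. with warmup_proportion=0.1 the lr peaks 10% of the way through training.
# -----------------------------------------------------------------------------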