def __call__(self, model, device, args):
    log = self._logger

    # Prepare optimizer and schedule (linear warmup and decay)
    optimization_steps = (len(self.train_dataloader) * args.epochs) // args.gradient_accumulation_steps
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=optimization_steps)

    # Train
    log.info(f"Training started with parameters {args}")
    model.zero_grad()
    global_step = 1
    for epoch in trange(args.epochs, desc="Epoch"):
        for step, batch in enumerate(tqdm(self.train_dataloader)):
            model.train()
            batch = tuple(t.to(device) for t in batch)  # send data to the target device
            model_input = {'input_ids': batch[0],       # word ids
                           'attention_mask': batch[1],  # input mask
                           'token_type_ids': batch[2],  # segment ids
                           'labels': batch[3]}          # labels
            outputs = model(**model_input)
            train_loss = outputs[0]
            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps
            train_loss.backward()  # accumulate gradients before updating the model

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm)  # gradient clipping
                optimizer.step()
                scheduler.step()
                model.zero_grad()

            # Evaluate the current model on the validation set every eval_steps batches
            if (step + 1) % args.eval_steps == 0 and not args.eval_per_epoch:
                self.evaluate_on_val_set(epoch, global_step, optimization_steps, model, device, scheduler, args)
            global_step += 1

        if args.eval_per_epoch:
            self.evaluate_on_val_set(epoch, global_step, optimization_steps, model, device, scheduler, args)
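# Note: a minimal, self-contained sketch of the step arithmetic used above. With gradient
# accumulation, the scheduler's t_total should count optimizer updates rather than batches,
# which is what optimization_steps computes. The helper name and the numbers in the assert
# are illustrative only, not taken from the snippet.
def total_optimizer_updates(batches_per_epoch, epochs, grad_acc_steps):
    """Number of optimizer.step() calls performed over the whole run."""
    return (batches_per_epoch * epochs) // grad_acc_steps

assert total_optimizer_updates(batches_per_epoch=1000, epochs=3, grad_acc_steps=4) == 750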
def main(): args = parse_args() # Devices if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") # Init distributed backend for sychronizing nodes/GPUs default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}") # Load config config = BertConfig.from_json_file(args.config_file) # Output dirs timestamp = args.config_file.split("/")[1].split(".")[0] save_path = os.path.join(args.output_dir, timestamp) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if default_gpu: if not os.path.exists(save_path): os.makedirs(save_path) # save all the hidden parameters. with open(os.path.join(save_path, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) cache = 5000 args.train_batch_size = args.train_batch_size // args.grad_acc_steps if dist.is_available() and args.local_rank != -1: num_replicas = dist.get_world_size() args.train_batch_size = args.train_batch_size // num_replicas args.num_workers = args.num_workers // num_replicas cache = cache // num_replicas # Seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Datasets tokenizer = AutoTokenizer.from_pretrained(config.bert_model, do_lower_case=config.do_lower_case) train_dataset = ConceptCapLoaderTrain(args.annotations_path, args.features_path, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, num_workers=args.num_workers, local_rank=args.local_rank, objective=args.objective, cache=cache, add_global_imgfeat=config.add_global_imgfeat, num_locs=config.num_locs) valid_dataset = ConceptCapLoaderVal(args.annotations_path, args.features_path, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, num_workers=2, objective=args.objective, add_global_imgfeat=config.add_global_imgfeat, num_locs=config.num_locs) # Task details task_names = ["Conceptual_Caption"] task_ids = ["TASK0"] task2num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size} # Logging logdir = os.path.join(args.logdir, timestamp) if default_gpu: tb_logger = tbLogger(logdir, save_path, task_names, task_ids, task2num_iters, args.grad_acc_steps) else: tb_logger = None # Model if args.from_pretrained: type_vocab_size = config.type_vocab_size config.type_vocab_size = 2 model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config, default_gpu=default_gpu, from_hf=True) # Resize type embeddings model.bert.embeddings.token_type_embeddings = \ model._get_resized_embeddings(model.bert.embeddings.token_type_embeddings, type_vocab_size) config.type_vocab_size = type_vocab_size else: model = BertForVLPreTraining(config) # Optimization details freeze_layers(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] bert_weight_name = json.load(open("config/" + args.from_pretrained + "_weight_name.json", "r")) if not args.from_pretrained: param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay}, {"params": [p for n, p in 
param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] else: optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if key[12:] in bert_weight_name: lr = args.learning_rate * 0.1 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": args.weight_decay}] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=args.adam_betas) num_train_optimization_steps = int( train_dataset.num_dataset / args.train_batch_size / args.grad_acc_steps ) * args.num_train_epochs warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optimization_steps scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) # Resume training start_iter_id, global_step, start_epoch, tb_logger, _ = \ resume(args.resume_file, model, optimizer, scheduler, tb_logger) # Move to GPU(s) model.cuda() for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Save starting model save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu, -1) # Print summary if default_gpu: summary_parameters(model, logger) logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataset.num_dataset) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) # Train for epoch_id in range(start_epoch, int(args.num_train_epochs)): model.train() for step, batch in enumerate(train_dataset): iter_id = start_iter_id + step + (epoch_id * len(train_dataset)) batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1]) input_ids, input_mask, segment_ids, lm_label_ids, is_match, \ image_feat, image_loc, image_cls, obj_labels, obj_confs, \ attr_labels, attr_confs, image_attrs, image_label, image_mask = batch if args.objective == 1: # Ignore labels (setting them to -1) for mismatched caption-image pairs image_label = image_label * (is_match == 0).long().unsqueeze(1) image_label[image_label == 0] = -1 lm_label_ids = lm_label_ids * (is_match == 0).long().unsqueeze(1) lm_label_ids[lm_label_ids == 0] = -1 masked_loss_t, masked_loss_v, pair_match_loss = model(input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_cls, obj_labels, obj_confs, attr_labels, attr_confs, image_attrs, is_match) if args.objective == 2: pair_match_loss = pair_match_loss * 0 loss = masked_loss_t + masked_loss_v + pair_match_loss if n_gpu > 1: loss = loss.mean() masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() pair_match_loss = pair_match_loss.mean() if args.grad_acc_steps > 1: loss = loss / args.grad_acc_steps loss.backward() if (step + 1) % args.grad_acc_steps == 0: # Clip gradient if args.clip_grad_norm > 0: 
torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if default_gpu: tb_logger.step_train_CC(epoch_id, iter_id, float(masked_loss_t), float(masked_loss_v), float(pair_match_loss), optimizer.param_groups[0]["lr"], "TASK0", "train") if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu: tb_logger.showLossTrainCC() # Do the evaluation torch.set_grad_enabled(False) numBatches = len(valid_dataset) model.eval() for step, batch in enumerate(valid_dataset): batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1]) input_ids, input_mask, segment_ids, lm_label_ids, is_match, \ image_feat, image_loc, image_cls, obj_labels, obj_confs, \ attr_labels, attr_confs, image_attrs, image_label, image_mask = batch batch_size = input_ids.size(0) masked_loss_t, masked_loss_v, pair_match_loss = model(input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_cls, obj_labels, obj_confs, attr_labels, attr_confs, image_attrs, is_match) loss = masked_loss_t + masked_loss_v + pair_match_loss if n_gpu > 1: loss = loss.mean() masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() pair_match_loss = pair_match_loss.mean() if default_gpu: tb_logger.step_val_CC(epoch_id, iter_id, float(masked_loss_t), float(masked_loss_v), float(pair_match_loss), "TASK0", batch_size, "val") sys.stdout.write("%d / %d \r" % (step, numBatches)) sys.stdout.flush() if default_gpu: tb_logger.showLossValCC() torch.set_grad_enabled(True) save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, loss) if default_gpu: tb_logger.txt_close()
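# For reference, a hedged sketch of the learning-rate shape that the WarmupLinearSchedule above
# is assumed to implement (pytorch-transformers style): linear warmup from 0 to the base LR
# over warmup_steps, then linear decay to 0 at t_total. Plain Python, not the library class.
def warmup_linear_multiplier(step, warmup_steps, t_total):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

assert warmup_linear_multiplier(0, 100, 1000) == 0.0
assert warmup_linear_multiplier(100, 100, 1000) == 1.0
assert abs(warmup_linear_multiplier(550, 100, 1000) - 0.5) < 1e-9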
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3] } outputs = model(**inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join( args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr( model, 'module' ) else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
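# A minimal sketch of the weight-decay grouping pattern repeated in these scripts: biases and
# LayerNorm weights are excluded from weight decay, all other parameters use the configured
# decay. The default decay value here is illustrative.
def build_param_groups(model, weight_decay=0.01, no_decay=("bias", "LayerNorm.weight")):
    decay_params = [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]
    no_decay_params = [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]
    return [{"params": decay_params, "weight_decay": weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0}]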
def train(train_task_name, model, tokenizer): set_seed(42) # for reproductibility # prepare training dataset train_features = convert_to_input_features_helper(train_examples, tokenizer, use_multiprocessing) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # !!!! no minus 1 train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # total batch size train_batch_size = per_gpu_train_batch_size * max(1, n_gpus) train_sampler = SequentialSampler(train_dataset) # was random sampler train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size) if max_steps > 0: t_total = max_steps num_trian_epochs = max_steps // len( train_dataloader) // gradient_accumulation_steps + 1 else: t_total = len( train_dataloader) // gradient_accumulation_steps * num_train_epochs # prepare optimizer and schedule (linear warmup and decay) warmup_steps = int(t_total * warmup_proportion) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total) if n_gpus > 1: print('*********** using multi gpu! ************') model = torch.nn.DataParallel(model) logger.info("***** Running %s *****", 'training') logger.info(" Num examples = %d", len(train_dataloader)) logger.info(" Batch size per gpu = %d", per_gpu_train_batch_size) logger.info(" Total batch size = %d", train_batch_size) logger.info(" Num steps = %d", t_total) # visualization # train max_grad_norm = 1 epoch = 0 # for visualization loss-epoch global_step = 0 tr_loss, loging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(num_train_epochs), desc='Epoch') saved_loss = [] for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(device) for t in batch) inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2], 'labels': batch[3] } outputs = model(**inputs) # unpack dict loss, logits = outputs[:2] # model outputs are in tuple saved_loss.append(loss.detach().cpu().numpy().item()) if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps # print("\r%f" % loss, end='') # delete this loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) tr_loss += loss.item() if (step + 1) % gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # call optimizer before scheduler model.zero_grad() global_step += 1 if max_steps > 0 and global_step > max_steps: epoch_iterator.close() break epoch += 1 # save model at each epoch output_model_dir = os.path.join(cache_dir, 'epoch_{}'.format(epoch)) if not os.path.exists(output_model_dir): os.makedirs(output_model_dir) model_to_save = model.module if hasattr( model, 'module') else model # take care of distributed/parallel training model_to_save.save_pretrained(output_model_dir) 
tokenizer.save_pretrained(output_model_dir) torch.save(stats, os.path.join(output_dir, 'training_args.bin')) logger.info('Saving model at epoch %d to %s' % (epoch, output_model_dir)) # evaluation using saved model if max_steps > 0 and global_step > max_steps: train_iterator.close() break # draw and save loss-step graph save_loss(saved_loss, global_step)
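# The comment above says "call optimizer before scheduler"; since PyTorch 1.1 the recommended
# order at an accumulation boundary is optimizer.step() first, then scheduler.step(), then
# zeroing the gradients. Hedged sketch of that pattern; argument names are illustrative.
import torch

def accumulation_update(step, model, optimizer, scheduler, grad_acc_steps, max_grad_norm=1.0):
    """Apply one optimizer update when an accumulation window closes; return True if applied."""
    if (step + 1) % grad_acc_steps != 0:
        return False
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()      # apply the accumulated gradients first...
    scheduler.step()      # ...then advance the per-update LR schedule
    optimizer.zero_grad()
    return True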
def main(): args = parse_args() # Devices if args.local_rank == -1: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}") # Load config config = BertConfig.from_json_file(args.config_file) # Load task config with open(args.tasks_config_file, "r") as f: task_cfg = edict(yaml.safe_load(f)) task_id = args.task.strip() task = "TASK" + task_id task_name = task_cfg[task]["name"] base_lr = task_cfg[task]["lr"] if task_cfg[task].get("fusion_method", None): # VL-BERT pooling for VQA config.fusion_method = task_cfg[task]["fusion_method"] # Output dirs if args.save_name: prefix = "-" + args.save_name else: prefix = "" timestamp = (task_name + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) save_path = os.path.join(args.output_dir, timestamp) if default_gpu: if not os.path.exists(save_path): os.makedirs(save_path) # save all the hidden parameters. with open(os.path.join(save_path, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # Seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # Dataset batch_size, task2num_iters, dset_train, dset_val, dl_train, dl_val = LoadDataset(args, config, task_cfg, args.task) # Logging logdir = os.path.join(args.logdir, timestamp) tb_logger = tbLogger(logdir, save_path, [task_name], [task], task2num_iters, args.grad_acc_steps) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Model if "roberta" in args.bert_model: config.model = "roberta" model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config, task_cfg=task_cfg, task_ids=[task]) if task_cfg[task].get("embed_clf", None): logger.info('Initializing classifier weight for %s from pretrained word embeddings...' 
% task) answers_word_embed = [] for k, v in model.state_dict().items(): if 'bert.embeddings.word_embeddings.weight' in k: word_embeddings = v.detach().clone() break for answer, label in sorted(dset_train.ans2label.items()): a_tokens = dset_train._tokenizer.tokenize(answer) a_ids = dset_train._tokenizer.convert_tokens_to_ids(a_tokens) if len(a_ids): a_word_embed = (torch.stack([word_embeddings[a_id] for a_id in a_ids], dim=0)).mean(dim=0) else: a_tokens = dset_train._tokenizer.tokenize("<unk>") a_id = dset_train._tokenizer.convert_tokens_to_ids(a_tokens)[0] a_word_embed = word_embeddings[a_id] answers_word_embed.append(a_word_embed) answers_word_embed_tensor = torch.stack(answers_word_embed, dim=0) for name, module in model.named_modules(): if name.endswith('clfs_dict.%s.logit_fc.3' % task): module.weight.data = answers_word_embed_tensor.to(device=module.weight.data.device) # Optimization details freeze_layers(model) criterion = LoadLoss(task_cfg, args.task) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": 0.0}] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{"params": [value], "lr": lr, "weight_decay": args.weight_decay}] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, eps=args.adam_epsilon, betas=args.adam_betas, correct_bias=args.adam_correct_bias) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr) num_train_optim_steps = (task2num_iters[task] * args.num_train_epochs // args.grad_acc_steps) warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optim_steps if args.lr_scheduler == "warmup_linear": scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optim_steps) else: scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_steps) # Resume training start_iter_id, global_step, start_epoch, tb_logger, max_score = \ resume(args.resume_file, model, optimizer, scheduler, tb_logger) # Move to GPU(s) model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Save starting model save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu) # Print summary if default_gpu: summary_parameters(model, logger) print("***** Running training *****") print(" Num Iters: ", task2num_iters[task]) print(" Batch size: ", batch_size) print(" Num steps: %d" % num_train_optim_steps) # Train for epoch_id in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"): model.train() for step, batch in enumerate(dl_train): iter_id = start_iter_id + step + (epoch_id * len(dl_train)) loss, score = ForwardModelsTrain(config, task_cfg, device, task, batch, model, criterion) if args.grad_acc_steps > 1: loss = loss / args.grad_acc_steps loss.backward() if (step + 1) % args.grad_acc_steps == 0: # Clip gradient if args.clip_grad_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() if global_step < warmup_steps or args.lr_scheduler == "warmup_linear": scheduler.step() model.zero_grad() global_step += 1 if default_gpu: tb_logger.step_train(epoch_id, iter_id, float(loss), float(score), optimizer.param_groups[0]["lr"], task, "train") if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu: tb_logger.showLossTrain() # Decide whether to evaluate task if iter_id != 0 and iter_id % task2num_iters[task] == 0: score = evaluate(config, dl_val, task_cfg, device, task, model, criterion, epoch_id, default_gpu, tb_logger) if score > max_score: max_score = score save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, max_score) save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step, tb_logger, default_gpu, max_score) tb_logger.txt_close()
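# Sketch of the warmup_steps fallback used above: an explicit warmup_steps value wins, otherwise
# warmup is taken as a proportion of the total number of optimizer updates. Note that because
# `or` treats 0 as falsy, warmup_steps=0 falls through to the proportion.
def resolve_warmup_steps(warmup_steps, warmup_proportion, num_train_optim_steps):
    return int(warmup_steps or warmup_proportion * num_train_optim_steps)

assert resolve_warmup_steps(0, 0.1, 1000) == 100     # proportion used when warmup_steps is 0
assert resolve_warmup_steps(500, 0.1, 1000) == 500   # explicit value takes precedence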
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--training_data_path", default=None, type=str, required=True, help="The training data path") parser.add_argument("--validation_data_path", default=None, type=str, required=True, help="The validation data path") parser.add_argument( "--mcq_model", default=None, type=str, required=True, help="choose one from the list: bert-mcq-parallel-max, " "bert-mcq_parallel-weighted-sum, bert-mcq-concat, mac-bert, or add roberta instead of bert" ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese, roberta-base, roberta-large" ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--max_grad_norm", default=None, type=float, help="Max gradient norm.") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--dropout", default=0.0, type=float, help="dropout") parser.add_argument( "--eval_freq", default=0, type=int, help="Evaluation steps frequency. Default is at the end of each epoch. " "You can also increase the frequency") parser.add_argument( '--tie_weights_weighted_sum', action='store_true', help="Whether to tie the weights for the weighted sum model") parser.add_argument('--max_number_premises', type=int, default=None, help="Number of premise sentences to use at max") parser.add_argument('--num_labels', type=int, default=3, help="Number of labels") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--with_score', action='store_true', help="Knowledge with score is provided") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) # true batch size args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, "mcq_inputs.json"), 'w') as f: json.dump(vars(args), f, indent=2) stdout_handler = prepare_global_logging(args.output_dir, False) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if "roberta" in args.bert_model: tokenizer = RobertaTokenizer.from_pretrained( "roberta-large", do_lower_case=args.do_lower_case) logger.info("Type of Tokenizer : ROBERTA") else: tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) logger.info("Type of Tokenizer : BERT") data_reader = None if args.mcq_model == 'bert-mcq-parallel-max': model = BertMCQParallel.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-concat': model = BertMCQConcat.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQConcatReader() elif args.mcq_model == 'bert-mcq-weighted-sum': model = BertMCQWeightedSum.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-simple-sum': model = BertMCQSimpleSum.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'bert-mcq-mac': model = BertMCQMAC.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = BertMCQParallelReader() elif args.mcq_model == 'roberta-mcq-parallel-max': model = RoBertaMCQParallel.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-concat': model = RoBertaMCQConcat.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQConcatReader() elif args.mcq_model == 'roberta-mcq-weighted-sum': model = RoBertaMCQWeightedSum.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-ws-score': model = RoBertaMCQWeightedSumScore.from_pretrained( args.bert_model, tie_weights=args.tie_weights_weighted_sum, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelScoreReader() elif args.mcq_model == 'roberta-mcq-simple-sum': model = RoBertaMCQSimpleSum.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-ss-score': model = RoBertaMCQSimpleSumScore.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) 
data_reader = RoBertaMCQParallelScoreReader() elif args.mcq_model == 'roberta-mcq-mac': model = RoBertaMCQMAC.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() elif args.mcq_model == 'roberta-mcq-conv3d': model = RoBertaMCQConv3d.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) data_reader = RoBertaMCQParallelReader() else: logger.error(f"Invalid MCQ model name {args.mcq_model}") exit(0) if args.do_train: # Prepare data loader # get data loader for train/dev train_data = data_reader.read(args.training_data_path, tokenizer, args.max_seq_length, args.max_number_premises) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = data_reader.read(args.validation_data_path, tokenizer, args.max_seq_length, args.max_number_premises) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # num_train_optimization_steps, dividing by effective batch size t_total = (len(train_dataloader) // args.gradient_accumulation_steps) * args.num_train_epochs num_train_optimization_steps = ( len(train_dataloader) // args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare optimizer # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) model.to(device) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1 and not args.no_cuda: model = torch.nn.DataParallel(model) global_step = 0 number_of_batches_per_epoch = len(train_dataloader) if args.eval_freq > 0: steps_to_eval = args.eval_freq else: steps_to_eval = number_of_batches_per_epoch logger.info("***** Training *****") logger.info(" num examples = %d", len(train_data)) logger.info(" batch size = %d", args.train_batch_size) logger.info(" num steps = %d", num_train_optimization_steps) logger.info(" number of Gpus= %d", n_gpu) logger.info("***** Evaluation *****") logger.info(" num examples = %d", len(eval_data)) logger.info(" batch size = %d", args.eval_batch_size) best_acc = 0.0 best_epoch = 1 for epoch_index in trange(int(args.num_train_epochs), desc="Epoch"): epoch_start_time = time.time() model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 tq = tqdm(train_dataloader, desc="Iteration") acc = 0 for step, batch in enumerate(tq): batch = tuple(t.to(device) for t in batch) if not args.with_score: input_ids, segment_ids, input_mask, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, label_ids) else: input_ids, segment_ids, input_mask, scores, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, scores, label_ids) loss = outputs[0] logits = outputs[1] logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_accuracy = accuracy(logits, label_ids) acc += tmp_accuracy if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if args.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() if args.max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 tq.set_description( _get_loss_accuracy(tr_loss / nb_tr_steps, acc / nb_tr_examples)) # TODO: always eval on last batch # For now select the batch_size appropriately if (((step + 1) % steps_to_eval == 0) or (step+1)==number_of_batches_per_epoch )\ and args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 etq = tqdm(eval_dataloader, desc="Validating") for batch in etq: batch = tuple(t.to(device) for t in batch) with torch.no_grad(): if not args.with_score: input_ids, segment_ids, input_mask, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, label_ids) else: input_ids, segment_ids, input_mask, scores, label_ids = batch outputs = model(input_ids, segment_ids, input_mask, scores, label_ids) tmp_eval_loss = outputs[0] logits = outputs[1] logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 etq.set_description( _get_loss_accuracy( eval_loss / nb_eval_steps, eval_accuracy / nb_eval_examples)) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples logger.info(f"epoch, step | {epoch_index}, {step}") 
logger.info(" | Training | Validation") logger.info("accuracy | %.4f" % (acc / nb_tr_examples) + " | %.4f" % eval_accuracy) logger.info("loss | %.4f" % (tr_loss / nb_tr_steps) + " | %.4f" % eval_loss) best_acc = max(best_acc, eval_accuracy) if eval_accuracy == best_acc: best_epoch = (epoch_index, step) logger.info( "best validation performance so far %.4f: " % best_acc + ", best epoch: " + str(best_epoch) + ". saving current model to " + args.output_dir) # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) model.train() epoch_end_time = time.time() logger.info( f"time it took to finish the epoch {epoch_index} of {args.num_train_epochs} is " + _show_runtime(epoch_end_time - epoch_start_time)) # Does this even make sense to output? result = { 'eval_accuracy': best_acc, 'global_step': global_step, 'best_epoch': best_epoch } cleanup_global_logging(stdout_handler) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, default='training/') parser.add_argument('--output_dir', type=Path, default='./model') parser.add_argument("--epochs", type=int, default=24000, help="Number of epochs to train for") parser.add_argument("--no_cuda", default=False) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--gradient_accumulation_steps', type=int, default=2, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument('--fp16', default=False, help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") print("This script will loop over the available data, but training diversity may be negatively impacted.") num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!") args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = Tokenizer(sp_path) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int( total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForPreTraining.from_pretrained(model_path) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(f" Batch size = {args.train_batch_size}") logging.info(f" Num steps = {num_train_optimization_steps}") model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if args.gradient_accumulation_steps > 1: 
loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") writer.add_scalar('Loss/train', mean_loss, (epoch + 1) * step) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
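# Bookkeeping note for the progress-bar loss above: each stored loss was divided by
# gradient_accumulation_steps before backward(), so the running mean multiplies it back out
# for display. Minimal sketch with illustrative names:
def mean_reported_loss(per_batch_losses, grad_acc_steps):
    scaled = [l / grad_acc_steps for l in per_batch_losses]       # what backward() saw
    return sum(scaled) * grad_acc_steps / len(per_batch_losses)   # undo the scaling for logging

assert abs(mean_reported_loss([2.0, 4.0], grad_acc_steps=2) - 3.0) < 1e-9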
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMaskedLM.from_pretrained(args.bert_model) # We don't need to manually call model.half() following Apex's recommend # if args.fp16: # model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: # from apex.optimizers import FP16_Optimizer # from apex.optimizers import FusedAdam from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) # This below line of code is the main upgrade of Apex Fp16 implementation. I chose opt_leve="01" # because it's recommended for typical use by Apex. We can make it configured model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # We don't need to use FP16_Optimizer wrapping over FusedAdam as well. 
Now Apex supports all Pytorch Optimizer # optimizer = FusedAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # bias_correction=False, # max_grad_norm=1.0) # if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) # else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) # else: # optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, attention_mask=input_mask, masked_lm_labels=lm_label_ids) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # I depricate FP16_Optimizer's backward func and replace as Apex document # optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule optimizer.zero_grad() global_step += 1 # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
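# Hedged sketch of the Apex AMP pattern the snippet above migrates to: amp.initialize() wraps
# the model and optimizer, and backward passes go through amp.scale_loss(). This assumes apex
# is installed; the opt_level default mirrors the "O1" choice above.
def to_fp16(model, optimizer, opt_level="O1"):
    try:
        from apex import amp
    except ImportError:
        raise ImportError("Please install apex to use fp16 training.")
    return amp.initialize(model, optimizer, opt_level=opt_level)

# Usage inside the training loop (illustrative):
#     with amp.scale_loss(loss, optimizer) as scaled_loss:
#         scaled_loss.backward()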
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=str, required=True) parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument("--reduce_memory", action="store_true", help="Store training data as on-disc memmaps to massively reduce memory usage") parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--gradient_accumulation_steps', type=int, default=4, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--fp16_opt_level', type=str, default='O1', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print(f"Warning! 
There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).") print("This script will loop over the available data, but training diversity may be negatively impacted.") num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int( total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) model.to(device) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) for epoch in range(args.epochs): model.train() epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == 
-1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() # Update learning rate schedule optimizer.step() model.zero_grad() global_step += 1 tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") logging.info("** ** * Saving fine-tuned model ** ** * ") output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir)
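# --- Hedged sketch (not part of the script above): because the checkpoint is
# written under the standard WEIGHTS_NAME / CONFIG_NAME file names and the
# tokenizer vocabulary is saved alongside it, the directory can be reloaded
# directly with from_pretrained; `output_dir` is the checkpoint directory
# created above.
model = BertForPreTraining.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)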
def fit(self, X, y, dev=None): tokens, masks = self.tokenize(X) tokens = torch.LongTensor(tokens) masks = torch.LongTensor(masks) y = torch.FloatTensor(y) train_data = TensorDataset(tokens, y, masks) # compute class and instance weights for sampling # for the hierarchical task the weights are computed per column group minmax_ratio = int(os.environ.get('SAMPLING_MINMAX_RATIO', -1)) if self.hierarchical: class_weights = y.sum(dim=0) for lvl, idx in self.label_hierarchy.items(): weights_ = class_weights[idx] if weights_.min() == weights_.max(): class_weights[idx] = minmax_ratio / 2 continue if minmax_ratio == -1: minmax_ratio = int(weights_.max() / weights_.min()) inverted = (weights_.max() - weights_) + 1 class_weights_ = (((inverted - inverted.min()) * (minmax_ratio - 1)) / (inverted.max() - inverted.min())) + 1 class_weights[idx] = class_weights_ instance_weights = (y * class_weights).mean(dim=1) else: counts = y.sum(dim=0) + 1 if minmax_ratio == -1: minmax_ratio = int(counts.max() / counts.min()) inverted = (counts.max() - counts) + 1 class_weights = (((inverted - inverted.min()) * (minmax_ratio - 1)) / (inverted.max() - inverted.min())) + 1 instance_weights = (y * class_weights).max(dim=1)[0] train_sampler = WeightedRandomSampler(instance_weights, len(X), replacement=True) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size) if self.hierarchical: model = BertForHierarchicalMultilabelSequenceClassification.from_pretrained( self.model_name, num_labels=len(y[0])) model.set_hierarchy(self.label_hierarchy) else: model = BertForMultilabelSequenceClassification.from_pretrained( self.model_name, num_labels=len(y[0])) model.loss = self.loss param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] #optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}] num_total_steps = self.n_epochs * (len(train_dataloader.sampler) // self.batch_size // self.gradient_accumulation_steps) num_warmup_steps = int(num_total_steps * 0.15) # To reproduce BertAdam specific behavior set correct_bias=False optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, correct_bias=False) # PyTorch scheduler scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = model.to(DEVICE) minmax_ratio = int(os.environ.get('LOSS_MINMAX_RATIO', -1)) if self.hierarchical: class_weights = y.sum(dim=0) + 1 for lvl, idx in self.label_hierarchy.items(): weights_ = class_weights[idx] if weights_.min() == weights_.max(): class_weights[idx] = minmax_ratio / 2 continue if minmax_ratio == -1: minmax_ratio = int(weights_.max() / weights_.min()) inverted = weights_.max() - weights_ class_weights_ = (((inverted - inverted.min()) * (minmax_ratio - 1)) / (inverted.max() - inverted.min())) + 1 class_weights[idx] = class_weights_ instance_weights = (y * class_weights).mean(dim=1) else: counts = y.sum(dim=0) + 1 if minmax_ratio == -1: minmax_ratio = int(counts.max() / counts.min()) inverted = counts.max() - counts class_weights = (((inverted - inverted.min()) * (minmax_ratio - 1)) / (inverted.max() - inverted.min())) + 1 instance_weights = (y * class_weights).max(dim=1)[0] 
class_weights = class_weights.to(DEVICE) i_step = 1 # stop pylint from complaining start_time = time.time() gradient_accumulation_steps = self.gradient_accumulation_steps epochs = trange(self.n_epochs, desc="Epoch") for i_epoch in epochs: steps = tqdm(train_dataloader, total=len(train_dataloader.sampler) // train_dataloader.batch_size + 1, desc='Mini-batch') train_loss = 0 batch_loss = 0 self.model = model.train() for i_step, batch in enumerate(steps): batch = (b.to(DEVICE) for b in batch) batch_input, batch_targets, batch_masks = batch loss, *_ = model(batch_input, label_hierarchy=self.label_hierarchy, labels=batch_targets, attention_mask=batch_masks, class_weights=class_weights) loss = loss / gradient_accumulation_steps loss.backward() batch_loss += loss.item() train_loss += loss.item() if (gradient_accumulation_steps <= 1 or (i_step + 1) % gradient_accumulation_steps == 0): batch_loss = batch_loss / self.gradient_accumulation_steps steps.set_postfix_str( f'loss {batch_loss:.4f} || ' f'avg. loss {train_loss / (i_step + 1):.4f}') with open('loss.txt', 'a') as fh: fh.write( f'batch\t{i_step}\t{batch_loss:.10f}\ttrain\n') optimizer.step() scheduler.step() optimizer.zero_grad() batch_loss = 0 if callable(self.post_epoch_hook): self.post_epoch_hook(self, i_epoch, dev) with open('loss.txt', 'a') as fh: fh.write( f'epoch\t{i_epoch}\t{train_loss / i_step:.10f}\ttrain\n') steps.close() epochs.set_postfix_str(f'avg. loss {train_loss / i_step:.4f}') self.model = model.to('cpu') return self
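# --- Hedged sketch (illustration only, not wired into the class above): the
# min-max rescaled inverse-frequency weighting that fit() computes twice, once
# for the WeightedRandomSampler and once for the loss, factored into a helper
# (the flat, non-hierarchical variant).
import torch


def minmax_class_weights(counts: torch.Tensor, minmax_ratio: int = -1) -> torch.Tensor:
    """Map per-label counts to weights in [1, minmax_ratio]; rarer labels get larger weights."""
    if counts.min() == counts.max():
        return torch.ones_like(counts)              # degenerate case: all labels equally frequent
    if minmax_ratio == -1:
        minmax_ratio = int(counts.max() / counts.min())
    inverted = (counts.max() - counts) + 1           # invert the frequencies
    return (((inverted - inverted.min()) * (minmax_ratio - 1))
            / (inverted.max() - inverted.min())) + 1

# e.g. for the flat case above:
#   class_weights = minmax_class_weights(y.sum(dim=0) + 1)
#   instance_weights = (y * class_weights).max(dim=1)[0]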
def do_train(self, model, dataloader): param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate) # initilize results epochs, best_epoch = 0, 0 min_or_max = 'min' if self.args.KeyEval in ['Loss'] else 'max' best_valid = 1e8 if min_or_max == 'min' else 0 while (epochs - best_epoch < self.args.early_stop): epochs += 1 # train y_pred, y_true = [], [] losses = [] model.train() train_loss = 0.0 left_epochs = self.args.update_epochs with tqdm(dataloader['train']) as td: for batch_data in td: if left_epochs == self.args.update_epochs: optimizer.zero_grad() left_epochs -= 1 text = batch_data['text'].to(self.args.device) audio = batch_data['audio'].to(self.args.device) vision = batch_data['vision'].to(self.args.device) labels = batch_data['labels']['M'].squeeze().to( self.args.device) # forward outputs = model(text, audio, vision) logits = outputs[0].squeeze() # compute loss if self.args.output_mode == "classification": loss = self.classification_criterion( logits.view(-1, self.args.num_labels), labels.view(-1)) elif self.args.output_mode == "regression": loss = self.regression_criterion( logits.view(-1), labels.view(-1)) # backward loss.backward() # store results train_loss += loss.item() y_pred.append(logits.cpu()) y_true.append(labels.cpu()) if not left_epochs: optimizer.step() left_epochs = self.args.update_epochs if not left_epochs: # update optimizer.step() train_loss = train_loss / len(dataloader['train']) print("TRAIN-(%s) (%d/%d/%d)>> loss: %.4f " % (self.args.modelName, \ epochs-best_epoch, epochs, self.args.cur_time, train_loss)) pred, true = torch.cat(y_pred), torch.cat(y_true) train_results = self.metrics(pred, true, exclude_zero=self.args.excludeZero) print('%s: >> ' % (self.args.tasks) + dict_to_str(train_results)) # validation val_results = self.do_test(model, dataloader['valid'], mode="VAL") cur_valid = val_results[self.args.tasks[0]][self.args.KeyEval] # save best model isBetter = cur_valid <= ( best_valid - 1e-6) if min_or_max == 'min' else cur_valid >= ( best_valid + 1e-6) if isBetter: best_valid, best_epoch = cur_valid, epochs model_path = os.path.join(self.args.model_save_path,\ f'{self.args.modelName}-{self.args.datasetName}-{self.args.tasks}.pth') if os.path.exists(model_path): os.remove(model_path) # save model torch.save(model.cpu().state_dict(), model_path) model.to(self.args.device) print('save model in %s...' % model_path) self.do_test(model, dataloader['test'], mode="TEST")
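# --- Hedged sketch (illustration only, not called by do_train above): the
# patience-based early stopping that the while-loop above implements, written
# out on its own. `evaluate_fn` is a placeholder returning the KeyEval metric
# for the current epoch.
def run_with_early_stopping(evaluate_fn, patience, minimize=False):
    best_valid = float('inf') if minimize else float('-inf')
    epoch, best_epoch = 0, 0
    while epoch - best_epoch < patience:             # stop once `patience` epochs pass without improvement
        epoch += 1
        cur_valid = evaluate_fn(epoch)
        improved = (cur_valid <= best_valid - 1e-6) if minimize \
            else (cur_valid >= best_valid + 1e-6)
        if improved:
            best_valid, best_epoch = cur_valid, epoch   # this is where do_train saves the checkpoint
    return best_epoch, best_valid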
def train(args, device, model, tokenizer): ''' Create a logger and tensorboard writer ''' logger = logging.getLogger(__name__) tb_writer = SummaryWriter(log_dir=args.tsbd_dir) ''' Create a training dataset and dataloader ''' train_dataset, num_labels = model_utils.load_and_cache_examples(args, tokenizer) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=args.num_cpu_workers, pin_memory=True) print("Length of the dataloader is ", len(train_dataloader)) num_train_optimization_steps = len(train_dataloader) // \ args.gradient_accumulation_steps * args.num_train_epochs print("Number of the total training steps = ", num_train_optimization_steps) ''' Create an optimizer and a scheduler instance ''' # Below is a little complicated - # they changed the implementation of the BertAdam to make AdamW without # any gradient clipping so now you have to do your own. # Read details at the bottom of readme at # https://github.com/huggingface/pytorch-transformers#migrating-from-pytorch-pretrained-bert-to-pytorch-transformers no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] args.warmup_steps = args.warmup_proportion * num_train_optimization_steps optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) # eps = args.adam_epsilon (can define); correct_bias can be set # to false like in the original tensorflow repository if args.scheduler == 'WarmupLinearSchedule': scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) if args.scheduler == 'ReduceLROnPlateau': scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, threshold=1e-6) ''' Log some important training parameters ''' logger.info("***** Running training *****") logger.info(" Data split file: %s", args.data_split_path) logger.info(" Data split mode: %s", args.data_split_mode) logger.info(" Training fold = %s\t Validation fold = %s"%(args.training_folds, args.validation_folds)) logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Total train batch size = %d", args.train_batch_size) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", num_train_optimization_steps) logger.info(" Initial learning rate = %f", args.learning_rate) logger.info(" Learning rate scheduler = %s", args.scheduler) logger.info(" Number of output channels = %s", num_labels) ''' Train the model ''' global_step = 0 tr_loss, logging_loss = 0.0, 0.0 last_epoch_loss = 0.0 last_epoch_global_step = 0 logging_img_loss, logging_txt_loss, logging_joint_loss = 0.0, 0.0, 0.0 last_epoch_img_loss, last_epoch_txt_loss, last_epoch_joint_loss = 0.0, 0.0, 0.0 tr_img_loss, tr_txt_loss, tr_joint_loss = 0.0, 0.0, 0.0 # https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/7 # to check the purpose of zero-ing out the gradients between minibatches train_iterator = trange(int(args.num_train_epochs), desc="Epoch") model.train() for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") 
tr_epoch_loss = 0 for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(device=device, non_blocking=True) for t in batch) image, label_raw, txt_ids, txt_mask, txt_segment_ids, \ label_onehot_or_ordinal, report_id = batch # label_raw is always 0-3 and # label_onehot_or_ordinal is one-hot or ordinal # depending on if it's multiclass or multilabel # report_id is the radiology report study ID that's unique to each report inputs = { 'input_img': image, 'input_ids': txt_ids, 'attention_mask': txt_mask, 'token_type_ids': txt_segment_ids, 'labels': None, 'bert_pool_last_hidden': args.bert_pool_last_hidden, 'bert_pool_use_img': args.bert_pool_use_img, 'bert_pool_img_lowerlevel': args.bert_pool_img_lowerlevel} outputs = model(**inputs) img_embedding, img_logits, txt_embedding, txt_logits = outputs[:4] # Model outputs are always tuple in pytorch-transformers (see doc) ''' Adjust the cross entropy loss function for different label encoding options ''' if args.output_channel_encoding == 'multilabel' and \ args.training_mode != 'semisupervised_phase1': label_ordinal = label_onehot_or_ordinal # Replace the image label with the ordinally encoded label BCE_loss_criterion = BCEWithLogitsLoss() img_loss = BCE_loss_criterion(img_logits.view(-1, num_labels), label_ordinal.view(-1, num_labels).float()) txt_loss = BCE_loss_criterion(txt_logits.view(-1, num_labels), label_ordinal.view(-1, num_labels).float()) elif args.output_channel_encoding == 'multiclass' and \ args.training_mode != 'semisupervised_phase1': label = label_raw CrossEntropyCriterion = CrossEntropyLoss() # In this case, softmax is added in the model # and the CrossEntropyCriterion only accepts raw labels 0-3 img_loss = CrossEntropyCriterion(img_logits.view(-1, num_labels), label.view(-1).long()) txt_loss = CrossEntropyCriterion(txt_logits.view(-1, num_labels), label.view(-1).long()) ''' Define loss functions ''' if args.joint_loss_method == 'l2': joint_loss_criterion = torch.nn.MSELoss() joint_loss = joint_loss_criterion(img_embedding, txt_embedding) elif args.joint_loss_method == 'cosine': joint_loss_criterion = torch.nn.CosineEmbeddingLoss() y = torch.ones(img_embedding.shape[0], device=device) y.requires_grad = False joint_loss = joint_loss_criterion(x1=img_embedding, x2=txt_embedding, y=y) # y is ones so the joint loss is the negative inverse of cosine elif args.joint_loss_method == 'dot': joint_loss = custom_loss.dot_product_loss(img_embedding, txt_embedding) elif args.joint_loss_method == 'ranking': joint_loss = custom_loss.ranking_loss( img_embedding, txt_embedding, label_raw, report_id, similarity_function=args.joint_loss_similarity_function) if args.training_mode == 'supervised' or \ args.training_mode == 'semisupervised_phase2': loss = img_loss+txt_loss+joint_loss if args.training_mode == 'semisupervised_phase1': loss = joint_loss img_loss = joint_loss txt_loss = joint_loss # img_loss and txt_loss will not be computed and optimized # in the training mode of semisupervised_phase1 if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if 'grad' in optimizer_grouped_parameters: torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters, args.max_grad_norm) ''' Run optimizer and log loss terms during training ''' tr_loss += loss.item() tr_epoch_loss += loss.item() tr_img_loss += img_loss.item() tr_txt_loss += txt_loss.item() tr_joint_loss += joint_loss.item() if epoch == args.num_train_epochs - 1: last_epoch_loss += loss.item() last_epoch_img_loss += img_loss.item() 
last_epoch_txt_loss += txt_loss.item() last_epoch_joint_loss += joint_loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # important: Pytorch 0.1 and above needs optimizer step to happen before # see https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate # and notice this open bug with LR scheduler https://github.com/pytorch/pytorch/issues/22107 if args.scheduler == 'WarmupLinearSchedule': scheduler.step() # Update learning rate scheduler optimizer.zero_grad() global_step += 1 if epoch == args.num_train_epochs -1: last_epoch_global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: tb_writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step) tb_writer.add_scalar('loss/train', (tr_loss - logging_loss)/args.logging_steps, global_step) tb_writer.add_scalar('loss_img/train', (tr_img_loss - logging_img_loss)/args.logging_steps, global_step) tb_writer.add_scalar('loss_txt/train', (tr_txt_loss - logging_txt_loss)/args.logging_steps, global_step) tb_writer.add_scalar('loss_joint/train', (tr_joint_loss - logging_joint_loss)/args.logging_steps, global_step) logger.info(" [%d, %5d, %5d] learning rate = %.7f"%\ (epoch + 1, step + 1, global_step, optimizer.param_groups[0]['lr'])) logger.info(" [%d, %5d, %5d] loss = %.5f"%\ (epoch + 1, step + 1, global_step, (tr_loss - logging_loss)/args.logging_steps)) logger.info(" [%d, %5d, %5d] joint loss = %.5f"%\ (epoch + 1, step + 1, global_step, (tr_joint_loss - logging_joint_loss)/args.logging_steps)) logger.info(" [%d, %5d, %5d] image loss = %.5f"%\ (epoch + 1, step + 1, global_step, (tr_img_loss - logging_img_loss)/args.logging_steps)) logger.info(" [%d, %5d, %5d] text loss = %.5f"%\ (epoch + 1, step + 1, global_step, (tr_txt_loss - logging_txt_loss)/args.logging_steps)) logging_loss = tr_loss logging_img_loss = tr_img_loss logging_txt_loss = tr_txt_loss logging_joint_loss = tr_joint_loss if args.scheduler == 'ReduceLROnPlateau': scheduler.step(tr_epoch_loss) # Update learning rate scheduler ''' Save model checkpoint ''' if args.save_epochs > 0 and (epoch + 1) % args.save_epochs == 0: output_dir = os.path.join(args.checkpoints_dir, 'checkpoint-{}'.format(epoch+1)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) logger.info(" Epoch %d loss = %.5f" % (epoch + 1, tr_epoch_loss)) return {'global_step': global_step, 'training_loss': tr_loss / global_step, 'training_img_loss': tr_img_loss / global_step, 'training_txt_loss': tr_txt_loss / global_step, 'training_joint_loss': tr_joint_loss / global_step, 'last_epoch_training_loss': last_epoch_loss / last_epoch_global_step, 'last_epoch_img_loss': last_epoch_img_loss / last_epoch_global_step, 'last_epoch_txt_loss': last_epoch_txt_loss / last_epoch_global_step, 'last_epoch_joint_loss': last_epoch_joint_loss / last_epoch_global_step}
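# --- Hedged sketch (not part of train() above): with a target vector of ones,
# torch.nn.CosineEmbeddingLoss reduces to 1 - cos(x1, x2), which is what the
# 'cosine' joint_loss_method above relies on when aligning image and text
# embeddings.
import torch

x1, x2 = torch.randn(4, 16), torch.randn(4, 16)
y = torch.ones(x1.shape[0])                          # y = +1 for every pair, as in the code above
loss = torch.nn.CosineEmbeddingLoss()(x1, x2, y)
manual = (1 - torch.nn.functional.cosine_similarity(x1, x2)).mean()
assert torch.allclose(loss, manual, atol=1e-6)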
def main():
    batch_size = 32
    max_seq_len = 128
    n_epochs = 3
    bert_model = 'bert-base-uncased'
    learning_rate = 3e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    num_labels = 1
    output_dir = "fine_tuned--{0}--SEQ_LEN={1}--BATCH_SIZE={2}--HEAD={3}".format(
        bert_model, max_seq_len, batch_size, num_labels)
    dataset_dir = os.path.join("dataset", "custom_training_set.csv")  # avoid the unescaped backslash in the original path
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config = BertConfig.from_pretrained(bert_model)
    config.num_labels = num_labels
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    model = BertForSequenceClassification(config)
    model.to(device)
    train_dataset = Dataset(dataset_dir, tokenizer, max_seq_len)
    num_train_optimization_steps = int(len(train_dataset) / batch_size) * n_epochs
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    train_sampler = data.RandomSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    model.train()
    for _ in trange(n_epochs, desc="Epoch"):
        for batch in tqdm(train_dataloader, desc="Iteration"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, labels = batch
            # Pass the tensors by keyword: positionally, `labels` would land in the
            # `position_ids` slot of BertForSequenceClassification.forward.
            outputs = model(input_ids, attention_mask=input_mask,
                            token_type_ids=segment_ids, labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()       # step the optimizer before the LR scheduler
            scheduler.step()
            optimizer.zero_grad()
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, default='bert-base-uncased', help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model # model = BertForMaskedLM.from_pretrained('bert-base-uncased', # cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) model = BertForMaskedLM.from_pretrained('bert-base-uncased') model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() all_losses = [] for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                # This script never initialises apex amp, so the old
                # FP16_Optimizer-style `optimizer.backward(loss)` call was a dead
                # code path; a plain backward pass is used in both cases.
                loss.backward()
                tr_loss += loss.item()
                all_losses.append(loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()       # update weights before advancing the LR schedule
                    scheduler.step()       # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
                if step % 100 == 0:
                    plt.plot(all_losses)
                    plt.savefig(args.output_dir / 'losses.png')
                    plt.close()
        # Save a trained model after each epoch
        # if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1:
        logging.info("** ** * Saving fine-tuned model after epoch %d ** ** * " % epoch)
        model_to_save = model.module if hasattr(model, 'module') else model  # unwrap DataParallel before saving
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
nsp_scores = None
loss, lm_loss, nsp_loss, img_loss = forward(dialog_encoder, batch, params, sample_size=sample_size)
lm_nsp_loss = None
if lm_loss is not None and nsp_loss is not None:
    lm_nsp_loss = lm_loss + nsp_loss
loss /= params['batch_multiply']
loss.backward()
scheduler.step()
if iter_id % params['batch_multiply'] == 0 and iter_id > 0:
    optimizer.step()
    optimizer.zero_grad()
if iter_id % 10 == 0:
    end_t = timer()
    cur_epoch = float(iter_id) / num_iter_epoch
    timestamp = strftime('%a %d %b %y %X', gmtime())
    print_lm_loss = 0
    print_nsp_loss = 0
    print_lm_nsp_loss = 0
    print_img_loss = 0
    if lm_loss is not None:
        print_lm_loss = lm_loss.item()
    if nsp_loss is not None:
        print_nsp_loss = nsp_loss.item()
def main(): # os.environ['C UDA_VISIBLE_DEVICES'] = "0,1" batch_size = 64 parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--train_iter_multiplier", default=1.0, type=float, help="multiplier for the multi-task training.", ) parser.add_argument( "--train_iter_gap", default=4, type=int, help= "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--optim", default="AdamW", type=str, help="what to use for the optimization.") parser.add_argument("--tasks", default="0", type=str, help="discourse : TASK0") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.", ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not.", ) parser.add_argument("--evaluation_interval", default=1, type=int, help="evaluate very n epoch.") parser.add_argument( "--lr_scheduler", default="mannul", type=str, help="whether use learning rate scheduler.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", default=False, help="whether to use task specific tokens for the multi-task learning.", ) # todo args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] task_lr = [] task_id = 1 for i, task_id in enumerate(args.tasks.split("-")): task_id = str(1) task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) task_lr.append(task_cfg[task]["lr"]) base_lr = min(task_lr) loss_scale = {} for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id loss_scale[task] = task_lr[i] / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ("-".join("discourse") + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( # args, task_cfg, args.tasks.split("-"),'train' # ) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) labels = [ "Visible", 'Subjective', 'Action', 'Story', 'Meta', 'Irrelevant', 'Other' ] train_dataset = DiscourseRelationDataset( labels, task_cfg[task]["dataroot"], tokenizer, args.bert_model, task_cfg[task]["max_seq_length"], encoding="utf-8", visual_target=0, batch_size=batch_size, shuffle=False, num_workers=4, cache=5000, drop_last=False, cuda=False, objective=0, visualization=False, ) train_sampler = RandomSampler(train_dataset) train_loader = DataLoader( train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=0, pin_memory=True, ) # for i in train_loader: # print("hello") # todo task_ids , task_num_tiers task_ids = ['TASK0'] task_num_iters = [100] task_batch_size = task_cfg['TASK0']["batch_size"] print("task_batch_size") print(task_batch_size) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: print("*********** config.task_specific_tokens = True ************") config.task_specific_tokens = True if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_ave_iter = {} task_stop_controller = {} # for task_id, num_iter in task_num_iters.items(): # task_ave_iter[task_id] = int( # task_cfg[task]["num_epoch"] # * num_iter # * args.train_iter_multiplier # / args.num_train_epochs # ) # task_stop_controller[task_id] = utils.MultiTaskStopOnPlateau( # mode="max", # patience=1, # continue_threshold=0.005, # cooldown=1, # threshold=0.001, # ) # task_ave_iter_list = sorted(task_ave_iter.values()) # median_num_iter = task_ave_iter_list[-1] # num_train_optimization_steps = ( # median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps # ) # num_labels = max([dataset.num_labels for dataset in task_datasets_train.values()]) # num_train_optimization_steps = int( # train_dataset.num_dataset # / task_batch_size # / args.gradient_accumulation_steps # ) * (args.num_train_epochs - args.start_epoch) # num_train_optimization_steps = int( # train_dataset.num_dataset # / task_batch_size # / args.gradient_accumulation_steps # ) * (args.num_train_epochs - args.start_epoch) num_train_optimization_steps = 10 num_labels = len(labels) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, 
config=config, num_labels=num_labels, default_gpu=default_gpu, ) model.double() model = model.to(device) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False, weight_decay=1e-4) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr, weight_decay=1e-4) # warmpu_steps = args.warmup_proportion * num_train_optimization_steps # if args.lr_scheduler == "warmup_linear": # warmup_scheduler = WarmupLinearSchedule( # optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps # ) # else: # warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmpu_steps) # # lr_reduce_list = np.array([5, 7]) # if args.lr_scheduler == "automatic": # lr_scheduler = ReduceLROnPlateau( # optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001 # ) # elif args.lr_scheduler == "cosine": # lr_scheduler = CosineAnnealingLR( # optimizer, T_max=median_num_iter * args.num_train_epochs # ) # elif args.lr_scheduler == "cosine_warm": # lr_scheduler = CosineAnnealingWarmRestarts( # # optimizer, T_0=median_num_iter * args.num_train_epochs # ) # elif args.lr_scheduler == "mannul": # # def lr_lambda_fun(epoch): # return pow(0.2, np.sum(lr_reduce_list <= epoch)) # # lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace( "module.", "", 1)] = checkpoint["model_state_dict"][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) # warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank 
!= -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", batch_size) print(" Num steps: %d" % num_train_optimization_steps) task_iter_train = {name: None for name in task_ids} task_count = {name: 0 for name in task_ids} # for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"): # model.train() # torch.autograd.set_detect_anomaly(True) # # for step in range(median_num_iter): # for step in range(1) # # iterId = startIterID + step + (epochId * median_num_iter) # first_task = True # for task_id in task_ids: # is_forward = False # # if (not task_stop_controller[task_id].in_stop) or ( # # iterId % args.train_iter_gap == 0 # # ): # args['start_epoch'] = 0 # args.num_train_epochs criterion = nn.BCEWithLogitsLoss() target_path = os.path.join(task_cfg[task]["dataroot"], "all_targets_json.json") all_targets = json.load(open(target_path, "r")) model = model.to(device) print(next(model.parameters()).is_cuda) for epochId in range(int(start_epoch), int(args.num_train_epochs)): model.train() is_forward = True if is_forward: # print("beforeLoop") # loss, score = ForwardModelsTrain( # args, # task_cfg, # device, # task_id, # task_count, # task_iter_train, # train_dataset, # model, # task_losses, # ) for step, batch in enumerate(train_loader): batch = tuple( t.to(device=device, non_blocking=True) if type(t) == torch.Tensor else t for t in batch) input_ids, input_mask, segment_ids, image_feat, image_loc, image_mask, image_id = ( batch) true_targets = [] for id in image_id: true_targets.append( np.fromiter(all_targets[id].values(), dtype=np.double)) true_targets = torch.from_numpy(np.array(true_targets)) true_targets = true_targets.to(device) model.double() model = model.to(device) discourse_prediction, vil_prediction, vil_prediction_gqa, vil_logit, vil_binary_prediction, vil_tri_prediction, vision_prediction, vision_logit, linguisic_prediction, linguisic_logit, _ \ = model( True, input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask ) loss = criterion(discourse_prediction, true_targets.type(torch.double)) loss.backward() optimizer.step() model.zero_grad() print("train train train done") # print("*********** ITERATION {} ***********".format(epochId)) print("*********** TRAIN PERFORMANCE ***********") print(loss) print( compute_score(discourse_prediction.to('cpu'), true_targets.type(torch.float).to('cpu'), 0.5)) print("*********** TEST PERFORMANCE ***********") evaluate(model, device, task_cfg, tokenizer, args, labels)
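# --- Hedged sketch (illustration only; `compute_score` above is project code
# whose internals are not shown here): a standard way to turn multi-label
# logits trained with BCEWithLogitsLoss into 0/1 predictions at the 0.5
# threshold before computing per-label scores.
import torch


def binarize_logits(logits: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    probs = torch.sigmoid(logits)                 # BCEWithLogitsLoss works on raw logits
    return (probs >= threshold).long()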
def train(self, train_category, dev_category, train_news, dev_news, tokenizer, Net=None, model=None): if os.path.exists(self.arguments.output_config_file) is True: os.remove(self.arguments.output_config_file) logger.info('>>train.shape: {} | dev.shape: {}'.format( train_category.shape, dev_category.shape)) train_dataloader, train_examples_len = Util.load_data( news=train_news, category=train_category, data_type='train', label_list=self.arguments.label_list, max_length=self.arguments.max_seq_length, tokenizer=tokenizer, batch_size=self.arguments.BATCH) dev_dataloader, dev_examples_len = Util.load_data( news=dev_news, category=dev_category, data_type='dev', label_list=self.arguments.label_list, max_length=self.arguments.max_seq_length, tokenizer=tokenizer, batch_size=self.arguments.BATCH) num_train_optimization_steps = int( train_examples_len / self.arguments.BATCH / self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS # 模型准备 logger.info("model name is {}".format(self.arguments.model_name)) if model is None: if self.arguments.model_name == "BertOrigin": model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. bert_model_dir, num_labels=self.arguments.num_labels, cache_dir=self.arguments.cache_dir) elif self.arguments.model_name == 'BertHAN': model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. bert_model_dir, num_labels=self.arguments.num_labels, cache_dir=self.arguments.cache_dir) elif self.arguments.model_name == "BertCNN": filter_sizes = [ int(val) for val in self.arguments.filter_sizes.split() ] model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. bert_model_dir, num_labels=self.arguments.num_labels, n_filters=self.arguments.filter_num, filter_sizes=filter_sizes, cache_dir=self.arguments.cache_dir) elif self.arguments.model_name == "BertATT": model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. bert_model_dir, num_labels=self.arguments.num_labels, cache_dir=self.arguments.cache_dir) elif self.arguments.model_name == "BertRCNN": model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. bert_model_dir, num_labels=self.arguments.num_labels, cache_dir=self.arguments.cache_dir, rnn_hidden_size=self.arguments.rnn_hidden_size, num_layers=self.arguments.num_layers, bidirectional=self.arguments.bidirectional, dropout=self.arguments.dropout) elif self.arguments.model_name == "BertCNNPlus": filter_sizes = [ int(val) for val in self.arguments.filter_sizes.split() ] model = Net.from_pretrained( pretrained_model_name_or_path=self.arguments. 
bert_model_dir, num_labels=self.arguments.num_labels, cache_dir=self.arguments.cache_dir, n_filters=self.arguments.filter_num, filter_sizes=filter_sizes) model.to(DEVICE) """ 优化器准备 """ param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # To reproduce BertAdam specific behavior set correct_bias=False optimizer = AdamW(params=optimizer_grouped_parameters, lr=self.arguments.learning_rate, correct_bias=False) # PyTorch scheduler scheduler = WarmupLinearSchedule( optimizer=optimizer, warmup_steps=self.arguments.warmup_proportion, t_total=num_train_optimization_steps) """ 损失函数准备 """ if self.arguments.use_label_smoothing: criterion = NMTCriterion( label_smoothing=self.arguments.label_smoothing) else: criterion = nn.CrossEntropyLoss() criterion = criterion.to(DEVICE) best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0 for epoch in range(int(self.arguments.EPOCHS)): if early_stop_times >= self.arguments.early_stop * ( train_examples_len // self.arguments.BATCH): break logger.info(f'---------------- Epoch: {epoch + 1:02} ----------') for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() if self.arguments.label_smoothing: criterion.train() batch = tuple(t.to(DEVICE) for t in batch) _, input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, segment_ids, input_mask, labels=None) loss = criterion(inputs=logits, labels=label_ids, normalization=1.0, reduce=False) # 修正 if self.arguments.gradient_accumulation_steps > 1: loss = loss / self.arguments.gradient_accumulation_steps loss.backward(torch.ones_like(loss)) scheduler.step() if (step + 1) % self.arguments.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % self.arguments.print_step == 0 and global_step != 0: dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate( model, dev_dataloader, criterion, DEVICE, self.arguments.label_list, args=self.arguments) logger.info('\n>>>dev report: \n{}'.format(dev_report)) # 以 acc 取优 if dev_acc > best_acc: best_acc = dev_acc # 以 auc 取优 if dev_auc > best_auc: best_auc = dev_auc # 保存模型 model_to_save = model.module if hasattr( model, 'module') else model torch.save(model_to_save.state_dict(), self.arguments.output_model_file) with open(self.arguments.output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) early_stop_times = 0 else: early_stop_times += 1 if os.path.exists(self.arguments.output_config_file) is False: model_to_save = model.module if hasattr(model, 'module') else model torch.save(model_to_save.state_dict(), self.arguments.output_model_file) with open(self.arguments.output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string())
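# --- Hedged sketch (not a change to the method above): WarmupLinearSchedule
# takes warmup_steps as an integer step count, but the call above passes
# warmup_proportion (a fraction such as 0.1) directly, which in effect skips
# warmup. Converting the proportion first, as the other scripts in this file
# do, would look like this; `warmup_proportion`, `optimizer` and
# `num_train_optimization_steps` are assumed to be defined as above.
warmup_steps = int(warmup_proportion * num_train_optimization_steps)
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=warmup_steps,
                                 t_total=num_train_optimization_steps)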
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=False, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=False, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=False, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"ner":NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list)# + 1 #consider the 0 for padded label pretrain_model_dir = 'bert-base-uncased' tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None train_examples = processor.get_negation_train_examples('/export/home/Dataset/negation/starsem-st-2012-data/cd-sco/corpus/training/SEM-2012-SharedTask-CD-SCO-training-09032012.txt') num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() model = NegationModel.from_pretrained(pretrain_model_dir, # cache_dir=cache_dir, num_labels = num_labels) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 # label_map = {i : label for i, label in enumerate(label_list,1)} if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_cue_label_ids = torch.tensor([f.cue_label_ids for f in train_features], dtype=torch.long) all_scope_label_ids = torch.tensor([f.scope_label_ids for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_cue_label_ids, 
all_scope_label_ids,all_valid_ids,all_lmask_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) '''load test data''' eval_examples = processor.get_negation_test_examples('/export/home/Dataset/negation/starsem-st-2012-data/cd-sco/corpus/test-gold', ['SEM-2012-SharedTask-CD-SCO-test-cardboard-GOLD.txt', 'SEM-2012-SharedTask-CD-SCO-test-circle-GOLD.txt']) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_cue_label_ids = torch.tensor([f.cue_label_ids for f in eval_features], dtype=torch.long) all_scope_label_ids = torch.tensor([f.scope_label_ids for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_cue_label_ids, all_scope_label_ids,all_valid_ids,all_lmask_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, cue_label_ids, scope_label_ids, valid_ids,l_mask = batch loss_cue, loss_scope = model(input_ids, segment_ids, input_mask, cue_label_ids, scope_label_ids,valid_ids,l_mask) loss = loss_cue + loss_scope if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 print('\nmean loss:', tr_loss/global_step) '''testing''' model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true_cue = [] y_pred_cue = [] y_true_scope = [] y_pred_scope = [] label_map = {i : label for i, label in enumerate(label_list)} for input_ids, input_mask, segment_ids, cue_label_ids,scope_label_ids,valid_ids,l_mask in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) cue_label_ids = cue_label_ids.to(device) scope_label_ids = scope_label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): ''' model(input_ids, segment_ids, input_mask, cue_label_ids, scope_label_ids,valid_ids,l_mask) ''' logits_cue, logits_scope = model(input_ids, segment_ids, input_mask,valid_ids=valid_ids,attention_mask_label=l_mask) task = 0 for logits, label_ids in zip([logits_cue, logits_scope], [cue_label_ids, scope_label_ids]): '''we do not want the predicted max label index is 0''' logits = nn.Sigmoid()(logits)# torch.argmax(F.log_softmax(logits,dim=2),dim=2) #(batch, max_len) logits = logits.detach().cpu().numpy() label_ids 
= label_ids.to('cpu').numpy()#(batch, max_len) # l_mask = l_mask.to('cpu').numpy()#(batch, max_len) for i, label in enumerate(label_ids): '''each sentence''' temp_1 = [] # gold temp_2 = [] # pred for j,m in enumerate(label): '''each word''' if j == 0: # is a pad continue elif l_mask[i][j] == 0: '''this means the gold label is [SEP], the end of sent''' if task == 0: y_true_cue.append(temp_1) y_pred_cue.append(temp_2) else: y_true_scope.append(temp_1) y_pred_scope.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append('1' if logits[i][j][0]>0.3 else '0') task+=1 # print('y_pred_cue:', y_pred_cue) # print('y_true_cue:', y_true_cue) f1_cue = 0.0 for true_cue_list, pred_cue_list in zip(y_true_cue, y_pred_cue): f1_cue+=f1_score(true_cue_list, pred_cue_list, pos_label='1') f1_cue/=len(y_true_cue) print('\ncue f1:', f1_cue) f1_scope = 0.0 for true_scope_list, pred_scope_list in zip(y_true_scope, y_pred_scope): f1_scope+=f1_score(true_scope_list, pred_scope_list, pos_label='1') f1_scope/=len(y_true_scope) print('scope f1:', f1_scope,'\n')
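# The evaluation above computes a token-level F1 (sklearn's f1_score with pos_label='1')
# per sentence and then averages it over sentences, separately for cues and scopes.
# A self-contained sketch of that averaging; the helper name and the toy inputs are
# illustrative, not part of the original script:
from sklearn.metrics import f1_score

def mean_sentence_f1(gold_sentences, pred_sentences, pos_label='1'):
    """Token-level F1 computed per sentence, then averaged, mirroring the cue/scope loops above."""
    total = 0.0
    for gold, pred in zip(gold_sentences, pred_sentences):
        total += f1_score(gold, pred, pos_label=pos_label)
    return total / len(gold_sentences)

# Toy example (labels are the same '0'/'1' strings the loop appends):
# mean_sentence_f1([['0', '1', '0'], ['1', '1']], [['0', '1', '1'], ['1', '0']])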
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--from_pretrained", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="save", type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--config_file", default="config/bert_base_6layer_6conect.json", type=str,
                        help="The config file that specifies the model details.")
    parser.add_argument("--num_train_epochs", default=20, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--train_iter_multiplier", default=1.0, type=float,
                        help="multiplier for the multi-task training.")
    parser.add_argument("--train_iter_gap", default=4, type=int,
                        help="forward every n iterations if the validation score has not improved "
                             "over the last 3 epochs; -1 means it will stop.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--do_lower_case", default=True, type=bool,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=0,
                        help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--num_workers", type=int, default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument("--save_name", default="", type=str,
                        help="save name for training.")
    parser.add_argument("--in_memory", default=False, type=bool,
                        help="whether to use chunks for parallel training.")
    parser.add_argument("--optim", default="AdamW", type=str,
                        help="which optimizer to use.")
    parser.add_argument("--freeze", default=-1, type=int,
                        help="up to which layer of the textual stream of ViLBERT to keep fixed (frozen).")
    parser.add_argument("--vision_scratch", action="store_true",
                        help="whether to train the visual stream from scratch instead of from pre-trained weights.")
    parser.add_argument("--evaluation_interval", default=1, type=int,
                        help="evaluate every n epochs.")
    parser.add_argument("--lr_scheduler", default="mannul", type=str,
                        help="which learning rate scheduler to use.")
    parser.add_argument("--baseline", action="store_true",
                        help="whether to use the single-stream baseline."
) parser.add_argument( "--resume_file", default="", type=str, help="Resume from checkpoint" ) parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) args = parser.parse_args() with open("task_config.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from src.models.basebert import BaseBertForVLTasks else: from src.models.vilbert import BertConfig from src.models.vilbert import VILBertForVLTasks name = task_cfg["name"] task_lr = task_cfg["lr"] base_lr = task_lr loss_scale = task_lr / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ( args.config_file.split("/")[1].split(".")[0] + prefix ) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r") ) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" ) n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. 
with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # load dataset task_batch_size, task_num_iters, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( args, task_cfg ) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_ave_iter = {} task_stop_controller = {} task_ave_iter = int( task_cfg["num_epoch"] * task_num_iters * args.train_iter_multiplier / args.num_train_epochs ) task_stop_controller = utils.TaskStopOnPlateau( mode="max", patience=1, continue_threshold=0.005, cooldown=1, threshold=0.001, ) median_num_iter = task_ave_iter num_train_optimization_steps = ( median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps ) num_labels = task_datasets_train.num_labels if args.dynamic_attention: config.dynamic_attention = True model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.0} ] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.01} ] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) # choose optimizer if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr) # choose scheduler warmpu_steps = args.warmup_proportion * num_train_optimization_steps if args.lr_scheduler == "warmup_linear": warmup_scheduler = WarmupLinearSchedule( optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps ) else: warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmpu_steps) lr_reduce_list = np.array([5, 7]) if args.lr_scheduler == "automatic": lr_scheduler = ReduceLROnPlateau( optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001 ) elif args.lr_scheduler == "cosine": lr_scheduler = CosineAnnealingLR( optimizer, T_max=median_num_iter * args.num_train_epochs ) elif args.lr_scheduler == "cosine_warm": lr_scheduler = CosineAnnealingWarmRestarts( optimizer, T_0=median_num_iter * args.num_train_epochs 
) elif args.lr_scheduler == "mannul": def lr_lambda_fun(epoch): return pow(0.2, np.sum(lr_reduce_list <= epoch)) lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace("module.", "", 1)] = checkpoint[ "model_state_dict" ][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) print("`==============`MODEL=============") print(next(model.parameters()).is_cuda)#False for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) print(" Num steps: %d" % num_train_optimization_steps) task_iter_train = None task_count = 0 for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch", ncols=100): model.train() for step in range(median_num_iter): iterId = startIterID + step + (epochId * median_num_iter) first_task = True is_forward = False if (not task_stop_controller.in_stop) or ( iterId % args.train_iter_gap == 0 ): is_forward = True if is_forward: loss, score = ForwardModelsTrain( args, task_cfg, device, task_count, task_iter_train, task_dataloader_train, model, task_losses, ) loss = loss * loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step if first_task and ( global_step < warmpu_steps or args.lr_scheduler == "warmup_linear" ): warmup_scheduler.step() optimizer.step() model.zero_grad() if first_task: global_step += 1 first_task = False if default_gpu: tbLogger.step_train( epochId, iterId, float(loss), float(score), optimizer.param_groups[0]["lr"], "train", ) if "cosine" in args.lr_scheduler and global_step > warmpu_steps: lr_scheduler.step() if ( step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu ): tbLogger.showLossTrain() # decided whether to evaluate on SNLI tasks. 
if (iterId != 0 and iterId % task_num_iters == 0) or ( epochId == args.num_train_epochs - 1 and step == median_num_iter - 1 ): evaluate( args, task_dataloader_val, task_stop_controller, task_cfg, device, model, task_losses, epochId, default_gpu, tbLogger, ) if args.lr_scheduler == "automatic": lr_scheduler.step(sum(val_scores.values())) logger.info("best average score is %3f" % lr_scheduler.best) elif args.lr_scheduler == "mannul": lr_scheduler.step() if epochId in lr_reduce_list: # reset the task_stop_controller once the lr drop task_stop_controller._reset() if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin" ) output_checkpoint = os.path.join(savePath, "pytorch_ckpt_latest.tar") torch.save(model_to_save.state_dict(), output_model_file) torch.save( { "model_state_dict": model_to_save.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "warmup_scheduler_state_dict": warmup_scheduler.state_dict(), # 'lr_scheduler_state_dict': lr_scheduler.state_dict(), "global_step": global_step, "epoch_id": epochId, "task_stop_controller": task_stop_controller, "tb_logger": tbLogger, }, output_checkpoint, ) tbLogger.txt_close()
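# The resume block above strips the "module." prefix that (Distributed)DataParallel adds
# to parameter names before calling load_state_dict. A standalone sketch of that idiom;
# the helper name is ours, and the checkpoint path/keys follow the code above:
import torch

def strip_module_prefix(state_dict):
    """Drop a leading 'module.' (added by (Distributed)DataParallel) from every key."""
    return {k[len("module."):] if k.startswith("module.") else k: v
            for k, v in state_dict.items()}

# checkpoint = torch.load("pytorch_ckpt_latest.tar", map_location="cpu")
# model.load_state_dict(strip_module_prefix(checkpoint["model_state_dict"]))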
def train(args): save_path = join(args.save_path, 'ckpt') if not os.path.exists(save_path): os.makedirs(save_path) args.train_batch_size = args.bs * max(1, args.n_gpu) train_loader, val_loader = batcher(args.path, args.train_batch_size) t_total = len(train_loader) // args.gradient_accumulation_steps * args.num_train_epochs print(t_total / args.num_train_epochs) tb_writer = SummaryWriter(log_dir=join(args.save_path, 'tensorboard')) model = Bert_choice() if args.cuda: model = model.cuda() model.train() optimizer = AdamW(model.parameters(), lr=1e-5) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total) if args.fp16: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') if args.n_gpu > 1: model = torch.nn.DataParallel(model) train_iterator = trange(int(args.num_train_epochs), desc="Epoch") global_ccr = 0 global_step = 0 for _ in train_iterator: epoch_iterator = tqdm(train_loader, desc="Iteration") tr_loss, logging_loss = 0, 0 tr_ccr = 0 for step, batch in enumerate(epoch_iterator): #questions, contexts, choicess = batch _inputs = batch.to(args.device) bs, cn, length = _inputs.size() labels = torch.tensor([0 for _ in range(bs)]).to(args.device) loss, _ids = model(_inputs, labels) ccr = sum([1 if _id == 0 else 0 for _id in _ids]) / len(_ids) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training tr_loss += loss.item() tr_ccr += ccr / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 2) optimizer.step() scheduler.step() model.zero_grad() global_ccr = global_ccr * 0.01 + tr_ccr tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss), global_step) tb_writer.add_scalar('ccr', global_ccr, global_step) global_step += 1 #print('loss: {:.4f} ccr {:.4f}\r'.format(tr_loss, ccr), end='') logging_loss = tr_loss tr_ccr = 0 if global_step % args.ckpt == 0: total_ccr, total_loss = evaluate(model, val_loader, args) name = 'ckpt-{:4f}-{:4f}-{}'.format(total_loss, total_ccr, global_step) save_dict = {} save_dict['state_dict'] = model.state_dict() save_dict['optimizer'] = optimizer.state_dict() torch.save(save_dict, join(save_path, name))
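# In the loop above, ccr is the fraction of examples whose predicted choice index is 0
# (the gold choice is listed first in each batch). A small vectorized equivalent of that
# list comprehension; the function name is ours, introduced only for illustration:
import torch

def correct_choice_ratio(pred_ids):
    """Fraction of predictions equal to 0, i.e. the first (gold) choice."""
    return (pred_ids == 0).float().mean().item()

# correct_choice_ratio(torch.tensor([0, 2, 0, 1]))  # -> 0.5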