def _create_optimizer(self, sgd):
    """Build a freshly zeroed AdamW optimizer over the wrapped model's parameters.

    Hyper-parameters are read from ``sgd``. ``pytt_lr`` and
    ``pytt_weight_decay`` are used when the attributes exist; otherwise the
    learning rate falls back to ``sgd.alpha`` and the weight decay to ``0.0``.

    Args:
        sgd: object exposing ``alpha``, ``eps``, ``b1`` and ``b2``.

    Returns:
        The AdamW instance, with ``zero_grad()`` already called on it.
    """
    learning_rate = getattr(sgd, "pytt_lr", sgd.alpha)
    decay = getattr(sgd, "pytt_weight_decay", 0.0)
    momentum_terms = (sgd.b1, sgd.b2)
    adamw = AdamW(
        self._model.parameters(),
        lr=learning_rate,
        eps=sgd.eps,
        betas=momentum_terms,
        weight_decay=decay,
    )
    adamw.zero_grad()
    return adamw
def _create_optimizer(self, sgd):
    """Build the optimizer for the wrapped model, optionally wrapped in SWA.

    An AdamW optimizer is configured from ``sgd`` (``pytt_lr`` and
    ``pytt_weight_decay`` override ``sgd.alpha`` and ``0.0`` when present).
    If ``sgd.pytt_use_swa`` is truthy the AdamW instance is wrapped in
    ``SWA(swa_start=1, swa_freq=10, swa_lr=sgd.alpha)``.

    Args:
        sgd: object exposing ``alpha``, ``eps``, ``b1`` and ``b2``.

    Returns:
        The (possibly SWA-wrapped) optimizer, with gradients zeroed.
    """
    settings = {
        "lr": getattr(sgd, "pytt_lr", sgd.alpha),
        "eps": sgd.eps,
        "betas": (sgd.b1, sgd.b2),
        "weight_decay": getattr(sgd, "pytt_weight_decay", 0.0),
    }
    result = AdamW(self._model.parameters(), **settings)
    wants_swa = getattr(sgd, "pytt_use_swa", False)
    if wants_swa:
        result = SWA(result, swa_start=1, swa_freq=10, swa_lr=sgd.alpha)
    result.zero_grad()
    return result
def main():
    """Fine-tune a BERT masked-language model on pregenerated training epochs.

    Parses the command line, reads ``epoch_{i}.json`` /
    ``epoch_{i}_metrics.json`` from ``--pregenerated_data``, trains
    ``BertForMaskedLM`` for ``--epochs`` epochs (optionally distributed and/or
    with Apex AMP mixed precision) and finally writes the model and the
    tokenizer to ``--output_dir``.

    Raises:
        SystemExit: if ``--pregenerated_data`` is not a directory or holds no
            data for the first epoch.
        ValueError: if ``--gradient_accumulation_steps`` is smaller than 1.
        ImportError: if Apex is needed (fp16 or distributed) but not installed.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model", type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory", action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    # Validate explicitly: an `assert` would be stripped when running with -O.
    if not args.pregenerated_data.is_dir():
        parser.error(
            "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!")

    # Discover how many epochs of pregenerated data are actually on disk.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                # SystemExit directly: the `exit` builtin only exists when `site` is loaded.
                raise SystemExit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    # From here on `train_batch_size` is the per-forward-pass (micro) batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # The modulo takes into account the fact that we may loop over limited epochs of data
    total_train_examples = sum(
        samples_per_epoch[i % len(samples_per_epoch)] for i in range(args.epochs))

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = (
            num_train_optimization_steps // torch.distributed.get_world_size())

    # Prepare model (no manual model.half(): Apex AMP handles the casting)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer: biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        # opt_level "O1" is Apex's recommended setting for typical use.
        # --loss_scale is honoured as documented in its help text: 0 selects
        # dynamic scaling (also the O1 default), any other value is static.
        model, optimizer = amp.initialize(
            model, optimizer, opt_level="O1",
            loss_scale="dynamic" if args.loss_scale == 0 else args.loss_scale)

    # Wrap for multi-GPU only AFTER amp.initialize: Apex requires the bare
    # model to be initialised before any (Distributed)DataParallel wrapper.
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_steps = 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                # segment ids and next-sentence labels are part of the batch but unused here
                input_ids, input_mask, _segment_ids, lm_label_ids, _is_next = batch
                outputs = model(input_ids, attention_mask=input_mask,
                                masked_lm_labels=lm_label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                pbar.update(1)
                # Undo the accumulation scaling so the displayed loss is per-batch.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model (only once, from the main process)
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        # Unwrap DataParallel / DistributedDataParallel before saving.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
def train(self, train_category, dev_category, train_news, dev_news, tokenizer, Net=None, model=None):
    """Train a BERT-based classifier and checkpoint the best dev-AUC model.

    Args:
        train_category, dev_category: label collections; only ``.shape`` is
            read here, the rest is handled by ``Util.load_data``.
        train_news, dev_news: the corresponding inputs for ``Util.load_data``.
        tokenizer: tokenizer forwarded to ``Util.load_data``.
        Net: model class providing ``from_pretrained``; required when
            ``model`` is None.
        model: an already constructed model to train instead of building one.

    Raises:
        ValueError: if no model is given and ``self.arguments.model_name`` is
            not one of the supported architectures.

    Side effects:
        Removes a stale ``output_config_file`` at start, and writes
        ``output_model_file`` / ``output_config_file`` whenever the dev AUC
        improves (or once at the end if it never did).
    """
    args = self.arguments

    # A leftover config file would make the final "nothing saved yet" check lie.
    if os.path.exists(args.output_config_file):
        os.remove(args.output_config_file)

    logger.info('>>train.shape: {} | dev.shape: {}'.format(
        train_category.shape, dev_category.shape))
    train_dataloader, train_examples_len = Util.load_data(
        news=train_news, category=train_category, data_type='train',
        label_list=args.label_list, max_length=args.max_seq_length,
        tokenizer=tokenizer, batch_size=args.BATCH)
    dev_dataloader, _dev_examples_len = Util.load_data(
        news=dev_news, category=dev_category, data_type='dev',
        label_list=args.label_list, max_length=args.max_seq_length,
        tokenizer=tokenizer, batch_size=args.BATCH)
    num_train_optimization_steps = int(
        train_examples_len / args.BATCH / args.gradient_accumulation_steps) * args.EPOCHS

    # Model preparation
    logger.info("model name is {}".format(args.model_name))
    if model is None:
        model_kwargs = dict(
            pretrained_model_name_or_path=args.bert_model_dir,
            num_labels=args.num_labels,
            cache_dir=args.cache_dir)
        if args.model_name in ("BertOrigin", "BertHAN", "BertATT"):
            pass  # these architectures take only the shared arguments
        elif args.model_name in ("BertCNN", "BertCNNPlus"):
            model_kwargs.update(
                n_filters=args.filter_num,
                filter_sizes=[int(val) for val in args.filter_sizes.split()])
        elif args.model_name == "BertRCNN":
            model_kwargs.update(
                rnn_hidden_size=args.rnn_hidden_size,
                num_layers=args.num_layers,
                bidirectional=args.bidirectional,
                dropout=args.dropout)
        else:
            # Previously this fell through and crashed later on `None.to(DEVICE)`.
            raise ValueError("Unsupported model name: {}".format(args.model_name))
        model = Net.from_pretrained(**model_kwargs)
    model.to(DEVICE)

    # Optimizer preparation: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(params=optimizer_grouped_parameters,
                      lr=args.learning_rate, correct_bias=False)

    # WarmupLinearSchedule expects a number of steps. A fractional
    # `warmup_proportion` is therefore converted against the total step count;
    # values >= 1 are passed through unchanged as an explicit step count.
    warmup_steps = args.warmup_proportion
    if 0 < warmup_steps < 1:
        warmup_steps = int(warmup_steps * num_train_optimization_steps)
    scheduler = WarmupLinearSchedule(
        optimizer=optimizer, warmup_steps=warmup_steps,
        t_total=num_train_optimization_steps)

    # Loss function preparation
    if args.use_label_smoothing:
        criterion = NMTCriterion(label_smoothing=args.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(DEVICE)

    def save_checkpoint():
        """Write the unwrapped model weights and its JSON config to disk."""
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), args.output_model_file)
        with open(args.output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

    best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
    for epoch in range(int(args.EPOCHS)):
        if early_stop_times >= args.early_stop * (train_examples_len // args.BATCH):
            break
        logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            model.train()
            if args.label_smoothing:
                criterion.train()
            batch = tuple(t.to(DEVICE) for t in batch)
            _, input_ids, input_mask, segment_ids, label_ids = batch
            logits = model(input_ids, segment_ids, input_mask, labels=None)
            if args.use_label_smoothing:
                loss = criterion(inputs=logits, labels=label_ids,
                                 normalization=1.0, reduce=False)
            else:
                # nn.CrossEntropyLoss only accepts (input, target); the
                # NMTCriterion keywords would raise TypeError here.
                # assumes logits are (batch, num_labels) and label_ids (batch,) -- TODO confirm
                loss = criterion(logits, label_ids)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            # Explicit ones as upstream gradient so a non-reduced loss tensor also works.
            loss.backward(torch.ones_like(loss))

            if (step + 1) % args.gradient_accumulation_steps != 0:
                continue

            # One scheduler tick per optimizer update, after the update:
            # `t_total` is expressed in optimizer steps, not micro-batches.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

            # NOTE(review): evaluation is tied to completed optimizer steps so
            # the same global_step is not re-evaluated for every micro-batch
            # during accumulation -- confirm against the original nesting.
            if global_step % args.print_step == 0:
                _dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                    model, dev_dataloader, criterion, DEVICE,
                    args.label_list, args=args)
                logger.info('\n>>>dev report: \n{}'.format(dev_report))
                # Track the best accuracy (informational only)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                # Model selection is driven by AUC
                if dev_auc > best_auc:
                    best_auc = dev_auc
                    save_checkpoint()
                    early_stop_times = 0
                else:
                    early_stop_times += 1

    # Nothing was saved during training (AUC never improved): keep the final weights.
    if not os.path.exists(args.output_config_file):
        save_checkpoint()
scheduler.load_state_dict(pretrained_dict_scheduler) start_iter_id = pretrained_dict['iterId'] del pretrained_dict, pretrained_dict_model, pretrained_dict_optimizer, pretrained_dict_scheduler, \ model_dict, optimizer_dict torch.cuda.empty_cache() num_iter_epoch = dataset.numDataPoints['train'] // (params['batch_size'] // params['sequences_per_image'] if (params['batch_size'] // params['sequences_per_image']) \ else 1 if not params['overfit'] else 5 ) print('\n%d iter per epoch.' % num_iter_epoch) dialog_encoder = nn.DataParallel(dialog_encoder) dialog_encoder.to(device) start_t = timer() optimizer.zero_grad() for epoch_id, idx, batch in batch_iter(dataloader, params): iter_id = start_iter_id + idx + (epoch_id * num_iter_epoch) dialog_encoder.train() # expand image features, orig_features = batch['image_feat'] orig_spatials = batch['image_loc'] orig_image_mask = batch['image_mask'] orig_image_target = batch['image_target'] orig_image_label = batch['image_label'] num_rounds = batch["tokens"].shape[1] num_samples = batch["tokens"].shape[2]
def main():
    """Pre-train (or continue pre-training) a vision-language BERT model.

    Reads options via ``parse_args()``, builds the Conceptual Captions train
    and validation loaders, creates ``BertForVLPreTraining`` (optionally from a
    pretrained checkpoint), and runs the train / validate / save cycle for
    ``args.num_train_epochs`` epochs. Only the default process (rank 0, or the
    single process when not distributed) writes logs and ``command.txt``.

    Raises:
        ImportError: if distributed training is requested but Apex is missing.
    """
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Init distributed backend for synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")
    distributed = dist.is_available() and args.local_rank != -1
    # The "default" process is the one allowed to log and write files.
    default_gpu = (not distributed) or dist.get_rank() == 0
    logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Output dirs: the run name is the config file name up to its first dot.
    # Taken from the basename so config paths of any directory depth work
    # (the previous `split("/")[1]` only handled "<dir>/<name>.json").
    timestamp = os.path.basename(args.config_file).split(".")[0]
    save_path = os.path.join(args.output_dir, timestamp)
    # exist_ok avoids a check-then-create race between distributed ranks.
    os.makedirs(args.output_dir, exist_ok=True)
    if default_gpu:
        os.makedirs(save_path, exist_ok=True)
        # save all the hidden parameters.
        with open(os.path.join(save_path, "command.txt"), "w") as f:
            print(args, file=f)
            print("\n", file=f)
            print(config, file=f)

    cache = 5000
    # Per-forward-pass batch size; further divided across replicas below.
    args.train_batch_size = args.train_batch_size // args.grad_acc_steps
    if distributed:
        num_replicas = dist.get_world_size()
        args.train_batch_size = args.train_batch_size // num_replicas
        args.num_workers = args.num_workers // num_replicas
        cache = cache // num_replicas

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Datasets
    tokenizer = AutoTokenizer.from_pretrained(config.bert_model, do_lower_case=config.do_lower_case)
    train_dataset = ConceptCapLoaderTrain(
        args.annotations_path, args.features_path, tokenizer,
        seq_len=args.max_seq_length, batch_size=args.train_batch_size,
        num_workers=args.num_workers, local_rank=args.local_rank,
        objective=args.objective, cache=cache,
        add_global_imgfeat=config.add_global_imgfeat, num_locs=config.num_locs)
    valid_dataset = ConceptCapLoaderVal(
        args.annotations_path, args.features_path, tokenizer,
        seq_len=args.max_seq_length, batch_size=args.train_batch_size,
        num_workers=2, objective=args.objective,
        add_global_imgfeat=config.add_global_imgfeat, num_locs=config.num_locs)

    # Task details
    task_names = ["Conceptual_Caption"]
    task_ids = ["TASK0"]
    task2num_iters = {"TASK0": train_dataset.num_dataset / args.train_batch_size}

    # Logging
    logdir = os.path.join(args.logdir, timestamp)
    if default_gpu:
        tb_logger = tbLogger(logdir, save_path, task_names, task_ids, task2num_iters, args.grad_acc_steps)
    else:
        tb_logger = None

    # Model
    if args.from_pretrained:
        # Load with 2 token types, then resize the type embeddings to the
        # configured size.
        type_vocab_size = config.type_vocab_size
        config.type_vocab_size = 2
        model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config,
                                                     default_gpu=default_gpu, from_hf=True)
        # Resize type embeddings
        model.bert.embeddings.token_type_embeddings = \
            model._get_resized_embeddings(model.bert.embeddings.token_type_embeddings, type_vocab_size)
        config.type_vocab_size = type_vocab_size
    else:
        model = BertForVLPreTraining(config)

    # Optimization details
    freeze_layers(model)
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": args.weight_decay},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        # The weight-name list is only needed (and its path only meaningful)
        # when starting from a pretrained checkpoint; the file is closed
        # deterministically by the context manager.
        with open("config/" + args.from_pretrained + "_weight_name.json", "r") as f:
            bert_weight_name = json.load(f)
        optimizer_grouped_parameters = []
        for key, value in model.named_parameters():
            if not value.requires_grad:
                continue
            # Parameters listed in the weight-name file get a 10x smaller
            # learning rate.
            # NOTE(review): key[12:] presumably strips a fixed 12-character
            # prefix from the parameter name -- confirm which one.
            if key[12:] in bert_weight_name:
                lr = args.learning_rate * 0.1
            else:
                lr = args.learning_rate
            if any(nd in key for nd in no_decay):
                weight_decay = 0.0
            else:
                weight_decay = args.weight_decay
            optimizer_grouped_parameters.append(
                {"params": [value], "lr": lr, "weight_decay": weight_decay})
        if default_gpu:
            print(len(list(model.named_parameters())), len(optimizer_grouped_parameters))

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      eps=args.adam_epsilon, betas=args.adam_betas)
    num_train_optimization_steps = int(
        train_dataset.num_dataset / args.train_batch_size / args.grad_acc_steps
    ) * args.num_train_epochs
    # An explicit step count wins; otherwise derive it from the proportion.
    warmup_steps = args.warmup_steps or args.warmup_proportion * num_train_optimization_steps
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    # Resume training
    start_iter_id, global_step, start_epoch, tb_logger, _ = \
        resume(args.resume_file, model, optimizer, scheduler, tb_logger)

    # Move to GPU(s), including any optimizer state restored by resume()
    model.cuda()
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Save starting model
    save(save_path, logger, -1, model, optimizer, scheduler, global_step, tb_logger, default_gpu, -1)

    # Print summary
    if default_gpu:
        summary_parameters(model, logger)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", train_dataset.num_dataset)
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

    # Train
    for epoch_id in range(start_epoch, int(args.num_train_epochs)):
        model.train()
        for step, batch in enumerate(train_dataset):
            iter_id = start_iter_id + step + (epoch_id * len(train_dataset))
            # The last batch element is not a tensor to move to the GPU.
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
            input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                attr_labels, attr_confs, image_attrs, image_label, image_mask = batch

            if args.objective == 1:
                # Ignore labels (setting them to -1) for mismatched caption-image pairs
                image_label = image_label * (is_match == 0).long().unsqueeze(1)
                image_label[image_label == 0] = -1
                lm_label_ids = lm_label_ids * (is_match == 0).long().unsqueeze(1)
                lm_label_ids[lm_label_ids == 0] = -1

            masked_loss_t, masked_loss_v, pair_match_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask,
                lm_label_ids, image_label, image_cls, obj_labels, obj_confs,
                attr_labels, attr_confs, image_attrs, is_match)

            if args.objective == 2:
                # Objective 2 drops the pair-matching term from the total loss.
                pair_match_loss = pair_match_loss * 0

            loss = masked_loss_t + masked_loss_v + pair_match_loss
            if n_gpu > 1:
                # DataParallel returns one loss per device
                loss = loss.mean()
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                pair_match_loss = pair_match_loss.mean()

            if args.grad_acc_steps > 1:
                loss = loss / args.grad_acc_steps
            loss.backward()

            if (step + 1) % args.grad_acc_steps == 0:
                # Clip gradient
                if args.clip_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            # NOTE(review): logging is done per micro-batch (tbLogger receives
            # grad_acc_steps at construction) -- confirm against the original nesting.
            if default_gpu:
                tb_logger.step_train_CC(epoch_id, iter_id,
                                        float(masked_loss_t), float(masked_loss_v),
                                        float(pair_match_loss),
                                        optimizer.param_groups[0]["lr"], "TASK0", "train")

            if (step % (20 * args.grad_acc_steps) == 0) and step != 0 and default_gpu:
                tb_logger.showLossTrainCC()

        # Do the evaluation; no_grad() restores gradient tracking even if
        # validation raises.
        numBatches = len(valid_dataset)
        model.eval()
        with torch.no_grad():
            for step, batch in enumerate(valid_dataset):
                batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch[:-1])
                input_ids, input_mask, segment_ids, lm_label_ids, is_match, \
                    image_feat, image_loc, image_cls, obj_labels, obj_confs, \
                    attr_labels, attr_confs, image_attrs, image_label, image_mask = batch

                batch_size = input_ids.size(0)
                masked_loss_t, masked_loss_v, pair_match_loss = model(
                    input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask,
                    lm_label_ids, image_label, image_cls, obj_labels, obj_confs,
                    attr_labels, attr_confs, image_attrs, is_match)

                loss = masked_loss_t + masked_loss_v + pair_match_loss
                if n_gpu > 1:
                    loss = loss.mean()
                    masked_loss_t = masked_loss_t.mean()
                    masked_loss_v = masked_loss_v.mean()
                    pair_match_loss = pair_match_loss.mean()

                if default_gpu:
                    tb_logger.step_val_CC(epoch_id, iter_id,
                                          float(masked_loss_t), float(masked_loss_v),
                                          float(pair_match_loss), "TASK0", batch_size, "val")
                    sys.stdout.write("%d / %d \r" % (step, numBatches))
                    sys.stdout.flush()

        if default_gpu:
            tb_logger.showLossValCC()

        save(save_path, logger, epoch_id, model, optimizer, scheduler, global_step,
             tb_logger, default_gpu, loss)

    if default_gpu:
        tb_logger.txt_close()
def main():
    """Train a negation cue/scope tagger on the *SEM-2012 CD-SCO data and report F1.

    Parses the command line, fine-tunes ``NegationModel`` (initialised from
    ``bert-base-uncased``) on the training corpus when ``--do_train`` is
    given, then evaluates on the gold test files and prints the
    sentence-averaged F1 for cue and scope labels.

    Raises:
        ValueError: if ``--gradient_accumulation_steps`` is smaller than 1 or
            ``--task_name`` is not a known task.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=False,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Previously missing: without this branch `device` was undefined and
        # get_world_size() below ran on an uninitialised process group.
        # NOTE(review): the model is not wrapped for distributed training and
        # the samplers are not distributed -- confirm whether multi-process
        # runs are actually intended for this script.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # From here on `train_batch_size` is the per-forward-pass batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    pretrain_model_dir = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)

    train_examples = processor.get_negation_train_examples('/export/home/Dataset/negation/starsem-st-2012-data/cd-sco/corpus/training/SEM-2012-SharedTask-CD-SCO-training-09032012.txt')
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = NegationModel.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    tr_loss = 0

    # NOTE(review): everything below runs only with --do_train; the nesting of
    # the original was ambiguous -- confirm.
    if not args.do_train:
        return

    def build_dataset(features):
        """Pack the feature objects into a TensorDataset of long tensors."""
        columns = ('input_ids', 'input_mask', 'segment_ids', 'cue_label_ids',
                   'scope_label_ids', 'valid_ids', 'label_mask')
        return TensorDataset(*(
            torch.tensor([getattr(f, column) for f in features], dtype=torch.long)
            for column in columns))

    def mean_f1(gold_sentences, predicted_sentences):
        """Average the per-sentence F1 of the '1' label; 0.0 when nothing was collected."""
        if not gold_sentences:
            return 0.0
        total = sum(f1_score(gold, predicted, pos_label='1')
                    for gold, predicted in zip(gold_sentences, predicted_sentences))
        return total / len(gold_sentences)

    train_features = convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_examples))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    train_data = build_dataset(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # Load test data
    eval_examples = processor.get_negation_test_examples('/export/home/Dataset/negation/starsem-st-2012-data/cd-sco/corpus/test-gold', ['SEM-2012-SharedTask-CD-SCO-test-cardboard-GOLD.txt', 'SEM-2012-SharedTask-CD-SCO-test-circle-GOLD.txt'])
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_data = build_dataset(eval_features)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.train()
    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, cue_label_ids, scope_label_ids, valid_ids, l_mask = batch
            loss_cue, loss_scope = model(input_ids, segment_ids, input_mask,
                                         cue_label_ids, scope_label_ids, valid_ids, l_mask)
            loss = loss_cue + loss_scope
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

    # NOTE(review): the mean loss and the test pass are emitted once, after
    # all epochs -- confirm against the original nesting.
    # max(..., 1) guards the case where no optimizer step was ever taken.
    print('\nmean loss:', tr_loss / max(global_step, 1))

    # Testing
    model.eval()
    y_true_cue = []
    y_pred_cue = []
    y_true_scope = []
    y_pred_scope = []
    label_map = {i: label for i, label in enumerate(label_list)}
    for input_ids, input_mask, segment_ids, cue_label_ids, scope_label_ids, valid_ids, l_mask in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        valid_ids = valid_ids.to(device)
        cue_label_ids = cue_label_ids.to(device)
        scope_label_ids = scope_label_ids.to(device)
        l_mask = l_mask.to(device)

        with torch.no_grad():
            logits_cue, logits_scope = model(input_ids, segment_ids, input_mask,
                                             valid_ids=valid_ids, attention_mask_label=l_mask)

        # One host copy of the mask instead of indexing a device tensor per token.
        mask_rows = l_mask.to('cpu').numpy()
        task_outputs = (
            (logits_cue, cue_label_ids, y_true_cue, y_pred_cue),
            (logits_scope, scope_label_ids, y_true_scope, y_pred_scope),
        )
        for logits, label_ids, gold_store, pred_store in task_outputs:
            probabilities = torch.sigmoid(logits).detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()  # assumed (batch, max_len) -- TODO confirm
            for i, label in enumerate(label_ids):
                # One sentence
                gold = []
                predicted = []
                for j in range(len(label)):
                    if j == 0:
                        # The first position is skipped
                        continue
                    if mask_rows[i][j] == 0:
                        # End of the labelled part of the sentence: store it.
                        # A sentence whose mask never reaches 0 is not stored.
                        gold_store.append(gold)
                        pred_store.append(predicted)
                        break
                    gold.append(label_map[label_ids[i][j]])
                    # A position is tagged '1' when its first score exceeds 0.3
                    predicted.append('1' if probabilities[i][j][0] > 0.3 else '0')

    f1_cue = mean_f1(y_true_cue, y_pred_cue)
    print('\ncue f1:', f1_cue)
    f1_scope = mean_f1(y_true_scope, y_pred_scope)
    print('scope f1:', f1_scope, '\n')
def do_train(self, model, dataloader):
    """Train ``model`` with gradient accumulation and early stopping.

    Parameters are split into two AdamW groups: weight decay 0.01 for most
    parameters and 0.0 for those whose name contains ``bias``,
    ``LayerNorm.bias`` or ``LayerNorm.weight``.  Gradients are accumulated
    over ``self.args.update_epochs`` batches per optimizer step.  After each
    epoch the model is evaluated on ``dataloader['valid']`` through
    ``self.do_test``; when ``self.args.KeyEval`` of the first task improves
    by more than 1e-6 the state dict is written below
    ``self.args.model_save_path``.  The loop ends once
    ``self.args.early_stop`` epochs pass without an improvement.

    Args:
        model: module called as ``model(text, audio, vision)``; element 0 of
            its output is used as the logits.
        dataloader: mapping with ``'train'``, ``'valid'`` and ``'test'``
            loaders; batches expose ``'text'``, ``'audio'``, ``'vision'``
            and ``['labels']['M']``.

    Raises:
        ValueError: if ``self.args.output_mode`` is neither
            ``"classification"`` nor ``"regression"``.
    """
    named_params = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in named_params
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in named_params
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate)

    # initialize results
    epochs, best_epoch = 0, 0
    min_or_max = 'min' if self.args.KeyEval in ['Loss'] else 'max'
    best_valid = 1e8 if min_or_max == 'min' else 0

    while epochs - best_epoch < self.args.early_stop:
        epochs += 1
        # train
        y_pred, y_true = [], []
        model.train()
        train_loss = 0.0
        # Counts down the batches left in the current accumulation window.
        left_epochs = self.args.update_epochs
        with tqdm(dataloader['train']) as td:
            for batch_data in td:
                if left_epochs == self.args.update_epochs:
                    # First batch of a new accumulation window.
                    optimizer.zero_grad()
                left_epochs -= 1
                text = batch_data['text'].to(self.args.device)
                audio = batch_data['audio'].to(self.args.device)
                vision = batch_data['vision'].to(self.args.device)
                labels = batch_data['labels']['M'].squeeze().to(
                    self.args.device)
                # forward
                outputs = model(text, audio, vision)
                logits = outputs[0].squeeze()
                # compute loss
                if self.args.output_mode == "classification":
                    loss = self.classification_criterion(
                        logits.view(-1, self.args.num_labels),
                        labels.view(-1))
                elif self.args.output_mode == "regression":
                    loss = self.regression_criterion(
                        logits.view(-1), labels.view(-1))
                else:
                    raise ValueError(
                        "Unsupported output_mode: %r" %
                        (self.args.output_mode,))
                # backward
                loss.backward()
                # store results; detach so the stored predictions do not
                # keep the autograd graph of every batch alive.
                train_loss += loss.item()
                y_pred.append(logits.detach().cpu())
                y_true.append(labels.cpu())
                if not left_epochs:
                    optimizer.step()
                    left_epochs = self.args.update_epochs
            # A partially filled window still holds gradients that have not
            # been applied.  The counter is reset to update_epochs after
            # every step, so "window in progress" is any other value.
            if left_epochs != self.args.update_epochs:
                # update
                optimizer.step()
        train_loss = train_loss / len(dataloader['train'])
        print("TRAIN-(%s) (%d/%d/%d)>> loss: %.4f " %
              (self.args.modelName, epochs - best_epoch, epochs,
               self.args.cur_time, train_loss))
        pred, true = torch.cat(y_pred), torch.cat(y_true)
        train_results = self.metrics(pred, true,
                                     exclude_zero=self.args.excludeZero)
        print('%s: >> ' % (self.args.tasks) + dict_to_str(train_results))

        # validation
        val_results = self.do_test(model, dataloader['valid'], mode="VAL")
        cur_valid = val_results[self.args.tasks[0]][self.args.KeyEval]

        # save best model
        if min_or_max == 'min':
            isBetter = cur_valid <= (best_valid - 1e-6)
        else:
            isBetter = cur_valid >= (best_valid + 1e-6)
        if isBetter:
            best_valid, best_epoch = cur_valid, epochs
            model_path = os.path.join(
                self.args.model_save_path,
                f'{self.args.modelName}-{self.args.datasetName}-{self.args.tasks}.pth')
            if os.path.exists(model_path):
                os.remove(model_path)
            # save model (state dict is written from CPU, then the model is
            # moved back to the training device)
            torch.save(model.cpu().state_dict(), model_path)
            model.to(self.args.device)
            print('save model in %s...' % model_path)
            # NOTE(review): the original nesting of this call was lost; it
            # is assumed to run each time a new best checkpoint is saved --
            # confirm against the intended behaviour.
            self.do_test(model, dataloader['test'], mode="TEST")
def fit(self, X, y, dev=None):
    """Fine-tune a BERT multilabel classifier on ``X`` / ``y``.

    The inputs are tokenized with ``self.tokenize`` and drawn through a
    ``WeightedRandomSampler`` whose instance weights come from inverted
    class frequencies rescaled to ``[1, minmax_ratio]`` (environment
    variable ``SAMPLING_MINMAX_RATIO``; ``-1`` derives the ratio from the
    data).  A second set of class weights (``LOSS_MINMAX_RATIO``) is passed
    to the model as ``class_weights``.  Per-window and per-epoch losses are
    appended to ``loss.txt``.

    Args:
        X: raw inputs accepted by ``self.tokenize``.
        y: label matrix convertible with ``torch.FloatTensor``; the number
            of labels is taken from ``len(y[0])``.
        dev: passed unchanged to ``self.post_epoch_hook`` after each epoch.

    Returns:
        ``self``; the trained model is stored on ``self.model`` and moved
        to the CPU.
    """
    tokens, masks = self.tokenize(X)
    tokens = torch.LongTensor(tokens)
    masks = torch.LongTensor(masks)
    y = torch.FloatTensor(y)
    train_data = TensorDataset(tokens, y, masks)

    # compute class and instance weights for sampling
    # for the hierarchical task the weights are computed per column group
    minmax_ratio = int(os.environ.get('SAMPLING_MINMAX_RATIO', -1))
    if self.hierarchical:
        class_weights = y.sum(dim=0)
        for idx in self.label_hierarchy.values():
            weights_ = class_weights[idx]
            if weights_.min() == weights_.max():
                class_weights[idx] = minmax_ratio / 2
                continue
            if minmax_ratio == -1:
                minmax_ratio = int(weights_.max() / weights_.min())
            inverted = (weights_.max() - weights_) + 1
            class_weights_ = (((inverted - inverted.min()) *
                               (minmax_ratio - 1)) /
                              (inverted.max() - inverted.min())) + 1
            class_weights[idx] = class_weights_
        instance_weights = (y * class_weights).mean(dim=1)
    else:
        counts = y.sum(dim=0) + 1
        if minmax_ratio == -1:
            minmax_ratio = int(counts.max() / counts.min())
        inverted = (counts.max() - counts) + 1
        class_weights = (((inverted - inverted.min()) *
                          (minmax_ratio - 1)) /
                         (inverted.max() - inverted.min())) + 1
        instance_weights = (y * class_weights).max(dim=1)[0]

    train_sampler = WeightedRandomSampler(instance_weights, len(X),
                                          replacement=True)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=self.batch_size)

    if self.hierarchical:
        model = BertForHierarchicalMultilabelSequenceClassification.from_pretrained(
            self.model_name, num_labels=len(y[0]))
        model.set_hierarchy(self.label_hierarchy)
    else:
        model = BertForMultilabelSequenceClassification.from_pretrained(
            self.model_name, num_labels=len(y[0]))
    model.loss = self.loss

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    num_total_steps = self.n_epochs * (
        len(train_dataloader.sampler) // self.batch_size //
        self.gradient_accumulation_steps)
    num_warmup_steps = int(num_total_steps * 0.15)

    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5,
                      correct_bias=False)
    # PyTorch scheduler
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=num_warmup_steps,
                                     t_total=num_total_steps)

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model = model.to(DEVICE)

    # Class weights handed to the model's loss.  Unlike the sampling
    # weights above, the inverted counts are not shifted by one here.
    minmax_ratio = int(os.environ.get('LOSS_MINMAX_RATIO', -1))
    if self.hierarchical:
        class_weights = y.sum(dim=0) + 1
        for idx in self.label_hierarchy.values():
            weights_ = class_weights[idx]
            if weights_.min() == weights_.max():
                class_weights[idx] = minmax_ratio / 2
                continue
            if minmax_ratio == -1:
                minmax_ratio = int(weights_.max() / weights_.min())
            inverted = weights_.max() - weights_
            class_weights_ = (((inverted - inverted.min()) *
                               (minmax_ratio - 1)) /
                              (inverted.max() - inverted.min())) + 1
            class_weights[idx] = class_weights_
    else:
        counts = y.sum(dim=0) + 1
        if minmax_ratio == -1:
            minmax_ratio = int(counts.max() / counts.min())
        inverted = counts.max() - counts
        class_weights = (((inverted - inverted.min()) *
                          (minmax_ratio - 1)) /
                         (inverted.max() - inverted.min())) + 1
    class_weights = class_weights.to(DEVICE)

    gradient_accumulation_steps = self.gradient_accumulation_steps
    epochs = trange(self.n_epochs, desc="Epoch")
    for i_epoch in epochs:
        steps = tqdm(train_dataloader,
                     total=len(train_dataloader.sampler) //
                     train_dataloader.batch_size + 1,
                     desc='Mini-batch')
        train_loss = 0
        batch_loss = 0
        n_batches = 0
        self.model = model.train()
        for i_step, batch in enumerate(steps):
            n_batches = i_step + 1
            batch = (b.to(DEVICE) for b in batch)
            batch_input, batch_targets, batch_masks = batch
            loss, *_ = model(batch_input,
                             label_hierarchy=self.label_hierarchy,
                             labels=batch_targets,
                             attention_mask=batch_masks,
                             class_weights=class_weights)
            # Log the unscaled loss; only the value that is backpropagated
            # is divided by the accumulation factor.
            step_loss = loss.item()
            loss = loss / gradient_accumulation_steps
            loss.backward()
            batch_loss += step_loss / gradient_accumulation_steps
            train_loss += step_loss
            if (gradient_accumulation_steps <= 1 or
                    (i_step + 1) % gradient_accumulation_steps == 0):
                steps.set_postfix_str(
                    f'loss {batch_loss:.4f} || '
                    f'avg. loss {train_loss / (i_step + 1):.4f}')
                with open('loss.txt', 'a') as fh:
                    fh.write(
                        f'batch\t{i_step}\t{batch_loss:.10f}\ttrain\n')
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                batch_loss = 0
        if callable(self.post_epoch_hook):
            self.post_epoch_hook(self, i_epoch, dev)
        # Average over the number of batches seen (not the last zero-based
        # index); max() keeps an empty loader from dividing by zero.
        epoch_loss = train_loss / max(n_batches, 1)
        with open('loss.txt', 'a') as fh:
            fh.write(f'epoch\t{i_epoch}\t{epoch_loss:.10f}\ttrain\n')
        steps.close()
        epochs.set_postfix_str(f'avg. loss {epoch_loss:.4f}')

    self.model = model.to('cpu')
    return self
def train(args, device, model, tokenizer):
    """Train the joint image/text model and report averaged losses.

    Builds the training dataloader via ``model_utils.load_and_cache_examples``,
    an AdamW optimizer with two weight-decay groups and the scheduler named
    by ``args.scheduler``.  For every batch an image loss, a text loss and a
    joint embedding loss are computed according to
    ``args.output_channel_encoding``, ``args.joint_loss_method`` and
    ``args.training_mode``.  Gradients are accumulated over
    ``args.gradient_accumulation_steps`` batches.  Scalars are written to
    TensorBoard every ``args.logging_steps`` optimizer steps and a checkpoint
    is saved every ``args.save_epochs`` epochs.

    Args:
        args: namespace holding the training configuration read below.
        device: device the batch tensors are moved to.
        model: module called with the keyword inputs assembled per batch;
            the first four outputs are image embedding, image logits, text
            embedding and text logits.
        tokenizer: forwarded to ``model_utils.load_and_cache_examples``.

    Returns:
        dict with ``global_step`` and the accumulated total / image / text /
        joint losses divided by the number of optimizer steps, for the
        whole run and for the last epoch only.
    """
    # Create a logger and tensorboard writer
    logger = logging.getLogger(__name__)
    tb_writer = SummaryWriter(log_dir=args.tsbd_dir)

    # Create a training dataset and dataloader
    train_dataset, num_labels = model_utils.load_and_cache_examples(
        args, tokenizer)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=args.num_cpu_workers,
                                  pin_memory=True)
    print("Length of the dataloader is ", len(train_dataloader))
    num_train_optimization_steps = len(train_dataloader) // \
        args.gradient_accumulation_steps * args.num_train_epochs
    print("Number of the total training steps = ",
          num_train_optimization_steps)

    # Create an optimizer and a scheduler instance
    # Below is a little complicated -
    # they changed the implementation of the BertAdam to make AdamW without
    # any gradient clipping so now you have to do your own.
    # Read details at the bottom of readme at
    # https://github.com/huggingface/pytorch-transformers#migrating-from-pytorch-pretrained-bert-to-pytorch-transformers
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    args.warmup_steps = args.warmup_proportion * num_train_optimization_steps
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    # eps = args.adam_epsilon (can define); correct_bias can be set
    # to false like in the original tensorflow repository
    if args.scheduler == 'WarmupLinearSchedule':
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=num_train_optimization_steps)
    if args.scheduler == 'ReduceLROnPlateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, threshold=1e-6)

    # Clipping is optional: it is applied only when args defines a positive
    # max_grad_norm.
    max_grad_norm = getattr(args, 'max_grad_norm', None)

    # Log some important training parameters
    logger.info("***** Running training *****")
    logger.info(" Data split file: %s", args.data_split_path)
    logger.info(" Data split mode: %s", args.data_split_mode)
    logger.info(" Training fold = %s\t Validation fold = %s",
                args.training_folds, args.validation_folds)
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Total train batch size = %d", args.train_batch_size)
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d",
                num_train_optimization_steps)
    logger.info(" Initial learning rate = %f", args.learning_rate)
    logger.info(" Learning rate scheduler = %s", args.scheduler)
    logger.info(" Number of output channels = %s", num_labels)

    # Train the model
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    last_epoch_loss = 0.0
    last_epoch_global_step = 0
    logging_img_loss, logging_txt_loss, logging_joint_loss = 0.0, 0.0, 0.0
    last_epoch_img_loss, last_epoch_txt_loss, last_epoch_joint_loss = \
        0.0, 0.0, 0.0
    tr_img_loss, tr_txt_loss, tr_joint_loss = 0.0, 0.0, 0.0
    # https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/7
    # to check the purpose of zero-ing out the gradients between minibatches

    n_epochs = int(args.num_train_epochs)
    # Index of the final epoch, derived from the same integer the loop uses
    # so a non-integer num_train_epochs still identifies a last epoch.
    last_epoch = n_epochs - 1
    train_iterator = trange(n_epochs, desc="Epoch")
    model.train()
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        tr_epoch_loss = 0
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(device=device, non_blocking=True)
                          for t in batch)
            image, label_raw, txt_ids, txt_mask, txt_segment_ids, \
                label_onehot_or_ordinal, report_id = batch
            # label_raw is always 0-3 and
            # label_onehot_or_ordinal is one-hot or ordinal
            # depending on if it's multiclass or multilabel
            # report_id is the radiology report study ID that's unique to
            # each report

            inputs = {
                'input_img': image,
                'input_ids': txt_ids,
                'attention_mask': txt_mask,
                'token_type_ids': txt_segment_ids,
                'labels': None,
                'bert_pool_last_hidden': args.bert_pool_last_hidden,
                'bert_pool_use_img': args.bert_pool_use_img,
                'bert_pool_img_lowerlevel': args.bert_pool_img_lowerlevel}
            outputs = model(**inputs)
            img_embedding, img_logits, txt_embedding, txt_logits = \
                outputs[:4]
            # Model outputs are always tuple in pytorch-transformers (see doc)

            # Adjust the cross entropy loss function for different label
            # encoding options
            if args.output_channel_encoding == 'multilabel' and \
                    args.training_mode != 'semisupervised_phase1':
                label_ordinal = label_onehot_or_ordinal
                # Replace the image label with the ordinally encoded label
                BCE_loss_criterion = BCEWithLogitsLoss()
                img_loss = BCE_loss_criterion(
                    img_logits.view(-1, num_labels),
                    label_ordinal.view(-1, num_labels).float())
                txt_loss = BCE_loss_criterion(
                    txt_logits.view(-1, num_labels),
                    label_ordinal.view(-1, num_labels).float())
            elif args.output_channel_encoding == 'multiclass' and \
                    args.training_mode != 'semisupervised_phase1':
                label = label_raw
                CrossEntropyCriterion = CrossEntropyLoss()
                # In this case, softmax is added in the model
                # and the CrossEntropyCriterion only accepts raw labels 0-3
                img_loss = CrossEntropyCriterion(
                    img_logits.view(-1, num_labels), label.view(-1).long())
                txt_loss = CrossEntropyCriterion(
                    txt_logits.view(-1, num_labels), label.view(-1).long())

            # Define loss functions
            if args.joint_loss_method == 'l2':
                joint_loss_criterion = torch.nn.MSELoss()
                joint_loss = joint_loss_criterion(img_embedding,
                                                  txt_embedding)
            elif args.joint_loss_method == 'cosine':
                joint_loss_criterion = torch.nn.CosineEmbeddingLoss()
                # Target of all ones: every image/text pair in the batch is
                # treated as a matching pair.
                y = torch.ones(img_embedding.shape[0], device=device)
                joint_loss = joint_loss_criterion(x1=img_embedding,
                                                  x2=txt_embedding,
                                                  y=y)
            elif args.joint_loss_method == 'dot':
                joint_loss = custom_loss.dot_product_loss(img_embedding,
                                                          txt_embedding)
            elif args.joint_loss_method == 'ranking':
                joint_loss = custom_loss.ranking_loss(
                    img_embedding, txt_embedding, label_raw, report_id,
                    similarity_function=args.joint_loss_similarity_function)

            if args.training_mode == 'supervised' or \
                    args.training_mode == 'semisupervised_phase2':
                loss = img_loss + txt_loss + joint_loss
            if args.training_mode == 'semisupervised_phase1':
                loss = joint_loss
                img_loss = joint_loss
                txt_loss = joint_loss
                # img_loss and txt_loss will not be computed and optimized
                # in the training mode of semisupervised_phase1

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Run optimizer and log loss terms during training
            tr_loss += loss.item()
            tr_epoch_loss += loss.item()
            tr_img_loss += img_loss.item()
            tr_txt_loss += txt_loss.item()
            tr_joint_loss += joint_loss.item()
            if epoch == last_epoch:
                last_epoch_loss += loss.item()
                last_epoch_img_loss += img_loss.item()
                last_epoch_txt_loss += txt_loss.item()
                last_epoch_joint_loss += joint_loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                # Clip the accumulated gradients of the model parameters
                # right before they are applied.  (The earlier check tested
                # the string 'grad' against a list of dicts, which is never
                # true, so clipping never ran.)
                if max_grad_norm is not None and max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)
                optimizer.step()
                # important: Pytorch 0.1 and above needs optimizer step to
                # happen before
                # see https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
                # and notice this open bug with LR scheduler
                # https://github.com/pytorch/pytorch/issues/22107
                if args.scheduler == 'WarmupLinearSchedule':
                    scheduler.step()  # Update learning rate scheduler
                optimizer.zero_grad()
                global_step += 1
                if epoch == last_epoch:
                    last_epoch_global_step += 1

                if args.logging_steps > 0 and \
                        global_step % args.logging_steps == 0:
                    current_lr = optimizer.param_groups[0]['lr']
                    avg_loss = (tr_loss - logging_loss) / args.logging_steps
                    avg_img = (tr_img_loss - logging_img_loss) / \
                        args.logging_steps
                    avg_txt = (tr_txt_loss - logging_txt_loss) / \
                        args.logging_steps
                    avg_joint = (tr_joint_loss - logging_joint_loss) / \
                        args.logging_steps
                    tb_writer.add_scalar('learning_rate', current_lr,
                                         global_step)
                    tb_writer.add_scalar('loss/train', avg_loss, global_step)
                    tb_writer.add_scalar('loss_img/train', avg_img,
                                         global_step)
                    tb_writer.add_scalar('loss_txt/train', avg_txt,
                                         global_step)
                    tb_writer.add_scalar('loss_joint/train', avg_joint,
                                         global_step)
                    logger.info(" [%d, %5d, %5d] learning rate = %.7f",
                                epoch + 1, step + 1, global_step, current_lr)
                    logger.info(" [%d, %5d, %5d] loss = %.5f",
                                epoch + 1, step + 1, global_step, avg_loss)
                    logger.info(" [%d, %5d, %5d] joint loss = %.5f",
                                epoch + 1, step + 1, global_step, avg_joint)
                    logger.info(" [%d, %5d, %5d] image loss = %.5f",
                                epoch + 1, step + 1, global_step, avg_img)
                    logger.info(" [%d, %5d, %5d] text loss = %.5f",
                                epoch + 1, step + 1, global_step, avg_txt)
                    logging_loss = tr_loss
                    logging_img_loss = tr_img_loss
                    logging_txt_loss = tr_txt_loss
                    logging_joint_loss = tr_joint_loss

        if args.scheduler == 'ReduceLROnPlateau':
            scheduler.step(tr_epoch_loss)  # Update learning rate scheduler

        # Save model checkpoint
        if args.save_epochs > 0 and (epoch + 1) % args.save_epochs == 0:
            output_dir = os.path.join(args.checkpoints_dir,
                                      'checkpoint-{}'.format(epoch + 1))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, 'module') \
                else model
            model_to_save.save_pretrained(output_dir)
        logger.info(" Epoch %d loss = %.5f", epoch + 1, tr_epoch_loss)

    # Flush pending TensorBoard events.
    tb_writer.close()

    # Guard the averages against runs in which no optimizer step happened.
    total_steps = max(global_step, 1)
    final_steps = max(last_epoch_global_step, 1)
    return {'global_step': global_step,
            'training_loss': tr_loss / total_steps,
            'training_img_loss': tr_img_loss / total_steps,
            'training_txt_loss': tr_txt_loss / total_steps,
            'training_joint_loss': tr_joint_loss / total_steps,
            'last_epoch_training_loss': last_epoch_loss / final_steps,
            'last_epoch_img_loss': last_epoch_img_loss / final_steps,
            'last_epoch_txt_loss': last_epoch_txt_loss / final_steps,
            'last_epoch_joint_loss': last_epoch_joint_loss / final_steps}
def main():
    """Train a BERT sequence classifier on the custom CSV set and save it.

    Uses fixed hyper-parameters (batch size 32, sequence length 128, three
    epochs, learning rate 3e-5, single output label), AdamW with two
    weight-decay groups and a linear warmup/decay schedule.  The trained
    model and the tokenizer are written to a directory whose name encodes
    the model name, sequence length, batch size and number of labels.
    """
    batch_size = 32
    max_seq_len = 128
    n_epochs = 3
    bert_model = 'bert-base-uncased'
    learning_rate = 3e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    num_labels = 1
    output_dir = "fine_tuned--{0}--SEQ_LEN={1}--BATCH_SIZE={2}--HEAD={3}".format(
        bert_model, max_seq_len, batch_size, num_labels)
    # Built with os.path.join: the former literal contained the invalid
    # escape "\c" and only resolved to a sub-directory on Windows.
    dataset_dir = os.path.join("dataset", "custom_training_set.csv")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = BertConfig.from_pretrained(bert_model)
    config.num_labels = num_labels
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # NOTE(review): constructing the model from the config alone does not
    # load pretrained weights, although the output directory is named
    # "fine_tuned" -- confirm whether from_pretrained was intended.
    model = BertForSequenceClassification(config)
    model.to(device)

    train_dataset = Dataset(dataset_dir, tokenizer, max_seq_len)
    train_sampler = data.RandomSampler(train_dataset)
    train_dataloader = data.DataLoader(train_dataset,
                                       sampler=train_sampler,
                                       batch_size=batch_size)
    # One optimizer step per batch; len(dataloader) also counts the final
    # partial batch of every epoch.
    num_train_optimization_steps = len(train_dataloader) * n_epochs

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)

    model.train()
    for _ in trange(n_epochs, desc="Epoch"):
        for batch in tqdm(train_dataloader, desc="Iteration"):
            input_ids, input_mask, segment_ids, labels = (
                t.to(device) for t in batch)
            # Keyword arguments keep mask, segment ids and labels bound to
            # the intended parameters regardless of forward()'s positional
            # order.
            outputs = model(input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            # The optimizer must step before the scheduler so the first
            # scheduled learning rate is not skipped.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
def main():
    """Fine-tune a BERT masked-language model on pregenerated epoch files.

    Parses the command line, counts the examples in the
    ``epoch_{i}_metrics.json`` files under ``--pregenerated_data``, sets up
    device, seeds, tokenizer, model, AdamW and a linear warmup schedule,
    then trains for ``--epochs`` epochs with gradient accumulation.  A loss
    curve is written to ``losses.png`` every 100 batches and the model and
    tokenizer are saved to ``--output_dir`` after every epoch.

    Raises:
        ValueError: if ``--gradient_accumulation_steps`` is smaller than 1.
        SystemExit: if no pregenerated data exists for the first epoch.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument(
        "--bert_model",
        type=str,
        default='bert-base-uncased',
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help="Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--epochs",
                        type=int,
                        default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                raise SystemExit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        # Loop finished without a break: data exists for every epoch.
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # --train_batch_size is the effective batch; each forward pass sees the
    # per-accumulation-step share.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(
            f"Output directory ({args.output_dir}) already exists and is not empty!"
        )
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Honour --bert_model / --do_lower_case.  Lower-casing stays enabled for
    # "uncased" checkpoints so the default invocation behaves as before.
    do_lower_case = args.do_lower_case or "uncased" in args.bert_model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over
        # limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = (
            num_train_optimization_steps //
            torch.distributed.get_world_size())

    # Prepare model
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(" Batch size = %d", args.train_batch_size)
    logging.info(" Num steps = %d", num_train_optimization_steps)
    model.train()
    all_losses = []
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                outputs = model(input_ids, segment_ids, input_mask,
                                lm_label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # NOTE(review): this relies on the optimizer exposing
                    # backward(); the AdamW instance created above does not
                    # appear to provide it -- confirm the fp16 path.
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                all_losses.append(loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Optimizer first, then the scheduler, so the first
                    # scheduled learning rate is not skipped.
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1
                if step % 100 == 0:
                    plt.plot(all_losses)
                    plt.savefig(args.output_dir / 'losses.png')
                    plt.close()

        # Save a trained model after each epoch
        logging.info(
            "** ** * Saving fine-tuned model after epoch %d ** ** * ", epoch)
        # DataParallel wraps the network; save_pretrained lives on the
        # wrapped module.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
def _str_to_bool(value):
    """Convert a command-line token such as ``"true"`` or ``"0"`` to a bool.

    Used as an argparse ``type=`` so that ``--flag False`` really means False
    (a plain string ``"False"`` would otherwise be truthy).

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    from argparse import ArgumentTypeError

    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("0", "false", "f", "no", "n", "off"):
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


def main():
    """Train ``BertForPreTraining`` on pregenerated epoch files.

    Command-line options are read from ``sys.argv``.  The function looks for
    ``epoch_{i}.json`` / ``epoch_{i}_metrics.json`` pairs in
    ``--pregenerated_data``, builds an AdamW optimizer with a linear warmup
    schedule, runs the training loop with gradient accumulation, logs the
    running mean loss through the module-level ``writer`` and finally saves the
    model and tokenizer to ``--output_dir``.

    Relies on module-level names defined elsewhere in the file (``Tokenizer``,
    ``sp_path``, ``model_path``, ``writer``, ``PregeneratedDataset``).

    Raises:
        AssertionError: if ``--pregenerated_data`` is not a directory.
        SystemExit: if no epoch data is found at all.
        ValueError: if ``--gradient_accumulation_steps`` is < 1, or if
            ``--train_batch_size`` is smaller than the accumulation steps
            (which would give a per-step batch size of 0).
    """
    import sys

    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, default='training/')
    parser.add_argument('--output_dir', type=Path, default='./model')
    parser.add_argument("--epochs", type=int, default=24000,
                        help="Number of epochs to train for")
    # Boolean options accept an optional explicit value ("--no_cuda",
    # "--no_cuda True", "--no_cuda False") and are parsed into real booleans.
    parser.add_argument("--no_cuda", type=_str_to_bool, nargs='?',
                        const=True, default=False)
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    # NOTE(review): fp16 is parsed and logged but not otherwise used below.
    parser.add_argument('--fp16', type=_str_to_bool, nargs='?',
                        const=True, default=False,
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Discover how many epochs of pregenerated data exist and how many
    # training examples each one holds.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                sys.exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        # The loop never hit a missing file: there is data for every epoch.
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # From here on train_batch_size is the size of one forward/backward pass.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    if args.train_batch_size < 1:
        # Without this check the step-count computation below divides by zero.
        raise ValueError(
            "Invalid train_batch_size parameter: it must be >= gradient_accumulation_steps ({}) "
            "so that every accumulation step gets at least one example".format(
                args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(sp_path)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    # NOTE(review): the model is not wrapped for distributed / multi-GPU
    # training here even though a DistributedSampler is used below - confirm
    # whether that is intentional.
    model = BertForPreTraining.from_pretrained(model_path)
    model.to(device)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    global_step = 0
    # Monotonic count of processed batches, used as the x-axis for loss logging.
    batches_seen = 0
    logging.info("***** Running training *****")
    logging.info(f" Num examples = {total_train_examples}")
    logging.info(f" Batch size = {args.train_batch_size}")
    logging.info(f" Num steps = {num_train_optimization_steps}")
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data,
                                            tokenizer=tokenizer, num_data_epochs=num_data_epochs)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                loss = outputs[0]
                if args.gradient_accumulation_steps > 1:
                    # Scale so the accumulated gradient matches one full batch.
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                # Undo the accumulation scaling for the reported running mean.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                # Use a counter that keeps increasing across epochs; the former
                # (epoch + 1) * step restarted at 0 and collided between epochs.
                writer.add_scalar('Loss/train', mean_loss, batches_seen)
                batches_seen += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    # Advance the schedule once per optimizer update: t_total is
                    # expressed in optimization steps, not in epochs.
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model (only from the main process when distributed).
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)