def compute_importance(model, parallel_model, updater, dataloaders, loss_type="l2"):
    """Mimic the deployment setup where the model is applied to some samples,
    which are then used to update the importance parameters.

    Uses the L2 norm of the function output, which is what MAS uses by default.
    """
    # model.eval()  # keep the model in training mode so we get gradients
    # train_loss_fct = DataParallelCriterion(CrossEntropyLoss(ignore_index=FILL_VAL), args.device_ids)
    softmax = torch.nn.Softmax(dim=-1)

    if loss_type == "l2":
        loss_fct = DataParallelCriterion(torch.nn.MSELoss(reduction='mean'), args.device_ids)
    elif loss_type == "l1":
        loss_fct = DataParallelCriterion(torch.nn.L1Loss(reduction='mean'), args.device_ids)
    elif loss_type == "ewc":
        CELoss = CrossEntropyLoss(ignore_index=FILL_VAL, reduction='mean', weight=TOKEN_WEIGHT)
        loss_fct = DataParallelCriterion(CELoss, args.device_ids)

    # Iterate over data.
    for dataloader in dataloaders:
        for cq, len_cq, cqa, len_cqa, Y, _, _ in dataloader:
            # get the inputs
            n_inputs = sum(len(_cq) for _cq in cq)
            for i in range(len(cqa)):
                cq[i] = (cq[i].to(args.device_ids[i]),)
                len_cq[i] = len_cq[i].to(args.device_ids[i])
                cqa[i] = (cqa[i].to(args.device_ids[i]),)
                len_cqa[i] = len_cqa[i].to(args.device_ids[i])
                Y[i] = Y[i].to(args.device_ids[i])

            # zero the parameter gradients
            updater.zero_grad()

            # forward
            if loss_type != "ewc":
                logits = parallel_model(cq)
                # keep only the logits at the last position of each context-question
                logits = [logit[range(len(logit)), len_cq[i] - 1, :]
                          for i, logit in enumerate(logits)]
                # logits = [softmax(logit, dim=-1) for logit in logits]
                target_zeros = [torch.zeros(logit.size()).to(args.device_ids[i])
                                for i, logit in enumerate(logits)]
                logits = [softmax(logit) for logit in logits]
                # l1 and l2 differ only in the criterion constructed above
                targets = loss_fct(logits, target_zeros)
            else:
                targets, _ = get_losses(parallel_model, cqa, Y, None, None, loss_fct)
            targets /= n_inputs

            # compute the gradients
            targets.backward()

            # update the parameter importances
            updater.step(model.reg_params, n_inputs)
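# The `updater` used above is constructed elsewhere. A minimal sketch of what an
# MAS-style importance updater could look like, assuming `reg_params` maps each
# parameter to a dict with a running 'importance' tensor and an 'n_seen' counter
# (names hypothetical, not taken from the snippet above):
import torch

class MASOmegaUpdater:
    """Hypothetical updater matching the calls above: zero_grad() clears
    gradients; step() folds |dL/dw| from the current batch into a running
    per-parameter importance average (the MAS omega)."""

    def __init__(self, model):
        self.model = model

    def zero_grad(self):
        self.model.zero_grad()

    def step(self, reg_params, n_inputs):
        with torch.no_grad():
            for param, state in reg_params.items():
                if param.grad is None:
                    continue
                seen = state.get('n_seen', 0)
                prev = state.get('importance', torch.zeros_like(param))
                # running average of absolute gradients, weighted by batch size
                state['importance'] = (prev * seen + param.grad.abs() * n_inputs) / (seen + n_inputs)
                state['n_seen'] = seen + n_inputs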
def CrossEntropyLoss(self, logit, target):
    n, c, h, w = logit.size()
    criterion = nn.CrossEntropyLoss(weight=self.weight,
                                    ignore_index=self.ignore_index,
                                    size_average=self.size_average)
    if self.cuda and len(self.args.gpu_ids) > 1:
        criterion = DataParallelCriterion(criterion)
    if self.cuda and len(self.args.gpu_ids) < 2:
        criterion = criterion.cuda()

    loss = criterion(logit, target.long())
    if self.batch_average:
        loss /= n
    return loss
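# Every snippet in this collection pairs DataParallelModel with DataParallelCriterion.
# A minimal sketch of the shared pattern, assuming the parallel.py utilities these
# snippets import: the model wrapper returns one output per GPU, and the criterion
# wrapper scatters the target, computes the loss on each device, and reduces to a scalar.
import torch
import torch.nn as nn
from parallel import DataParallelModel, DataParallelCriterion  # assumed local module

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4)).cuda()
criterion = nn.CrossEntropyLoss()
if torch.cuda.device_count() > 1:
    model = DataParallelModel(model)              # forward() now yields a list of per-GPU outputs
    criterion = DataParallelCriterion(criterion)  # consumes that list plus the unscattered target

x = torch.randn(8, 16).cuda()
y = torch.randint(0, 4, (8,)).cuda()
loss = criterion(model(x), y)  # scalar loss averaged across devices
loss.backward()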
def createModels(args, userNum, itemNum, adj=None):  # adj is only used by SPUIGAGPCF
    if args.model == 'SPUIGACF':
        model = SPUIGACF(userNum, itemNum, embedSize=args.embedSize,
                         layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'SPUIMultiGACF':
        model = SPUIMultiGACF(userNum, itemNum, embedSize=args.embedSize,
                              layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'SPUIGAGPCF':
        model = SPUIGAGPCF(userNum, itemNum, adj, embedSize=args.embedSize,
                           layers=args.layers, droprate=args.droprate).cuda()

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)        # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the criterion

    optim = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    return model, lossfn, optim
def __init__(self, model, mask_prob: float = 0.15, clip: int = 1, optimizer=None):
    self.model = model
    self.clip = clip
    self.optimizer = optimizer
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = self.model.to(self.device)
    self.mask_prob = mask_prob
    self.criterion = nn.NLLLoss(ignore_index=model.text_processor.pad_token_id())

    num_gpu = torch.cuda.device_count()
    if num_gpu > 1:
        print("Let's use", num_gpu, "GPUs!")
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)

    self.best_dev_loss = float("inf")
    self.best_train_loss = float("inf")
    self.last_train_loss = float("inf")
def __init__(self, cfg: Namespace, data: Dataset):
    """
    Args:
        cfg: configuration
        data: train dataset
    """
    self.cfg = cfg
    self.train, self.valid = data.split(0.8)
    RATING_FIELD.build_vocab(self.train)

    self.device = torch.device('cuda') if torch.cuda.is_available() \
        else torch.device('cpu')    # pylint: disable=no-member
    self.batch_size = cfg.batch_size
    if torch.cuda.is_available():
        self.batch_size *= torch.cuda.device_count()

    self.trn_itr = BucketIterator(self.train, device=self.device,
                                  batch_size=self.batch_size, shuffle=True,
                                  train=True, sort_within_batch=True,
                                  sort_key=lambda exam: -len(exam.comment_text))
    self.vld_itr = BucketIterator(self.valid, device=self.device,
                                  batch_size=self.batch_size, shuffle=False,
                                  train=False, sort_within_batch=True,
                                  sort_key=lambda exam: -len(exam.comment_text))
    self.log_step = 1000
    if len(self.vld_itr) < 100:
        self.log_step = 10
    elif len(self.vld_itr) < 1000:
        self.log_step = 100

    bert_path = cfg.bert_path if cfg.bert_path else 'bert-base-cased'
    self.model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=2)
    pos_weight = (len([exam for exam in self.train.examples if exam.target < 0.5])
                  / len([exam for exam in self.train.examples if exam.target >= 0.5]))
    pos_wgt_tensor = torch.tensor([1.0, pos_weight], device=self.device)    # pylint: disable=not-callable
    self.criterion = nn.CrossEntropyLoss(weight=pos_wgt_tensor)
    if torch.cuda.is_available():
        self.model = DataParallelModel(self.model.cuda())
        self.criterion = DataParallelCriterion(self.criterion)
    self.optimizer = optim.Adam(self.model.parameters(), cfg.learning_rate)
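# For example, with 900 negative and 100 positive training examples, pos_weight
# works out to 900 / 100 = 9.0, so the criterion weights the positive class nine
# times as heavily and offsets the class imbalance.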
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaders = create_datasets(num_workers=32, batch_size=600)

    # info = pd.read_csv("./flower_data/train.csv")[["image", "label"]]
    # class_weights = torch.tensor(1.0 / info.groupby(["label"]).count().values.astype(np.float32))
    # del info

    models_ensemble = [
        # {"name": "vgg", "model": models.vgg16_bn(pretrained=True)},
        {"name": "resnet", "model": models.resnet50(pretrained=True)},
        # {"name": "densenet", "model": models.densenet121(pretrained=True)},
        {"name": "resnet", "model": models.resnet101(pretrained=True)},
    ]
    # model = Ensemble(models_ensemble, name="star_ensemble")
    model = load_checkpoint("ensemble_iso_star_5118.pt")
    ft, cl = model.get_parameters()

    # model = nn.DataParallel(model)
    model = DataParallelModel(model)
    model = model.to(device)

    weight = torch.from_numpy(weight_train[0]).to(device)
    criterion = nn.NLLLoss(weight)
    criterion = DataParallelCriterion(criterion)

    optimizers = [optim.Adam(ft, lr=5e-4), optim.Adam(cl, lr=5e-3)]

    # print('-' * 40)
    # print("lr = {} bs = {}".format(lr, bs))
    # print('-' * 40)

    # Decay LR slightly after every step (gamma close to 1)
    exp_lr_schedulers = [
        lr_scheduler.StepLR(optimizers[0], step_size=1, gamma=0.995),
        lr_scheduler.StepLR(optimizers[1], step_size=1, gamma=0.992),
    ]

    model = [model, criterion, optimizers, exp_lr_schedulers, device]
    model = train_model(*model, loaders, num_epochs=100)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--discriminative_finetuning', action='store_true',
                        help='Whether to use discriminative fine-tuning')

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    # train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    #############################################################################
    # model = BertForPreTraining.from_pretrained(args.bert_model)
    model = BertForMaskedLM.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        if args.discriminative_finetuning:
            # layer-wise learning rates: each group of two layers gets a rate
            # 2.6x smaller than the group above it (ULMFiT-style)
            groups = [['layer.0', 'layer.1.'], ['layer.2', 'layer.3'],
                      ['layer.4', 'layer.5'], ['layer.6', 'layer.7'],
                      ['layer.8', 'layer.9'], ['layer.10', 'layer.11']]
            group_all = ['layer.0', 'layer.1', 'layer.2', 'layer.3', 'layer.4', 'layer.5',
                         'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.0},
            ]
            for i, group in enumerate(groups):
                lr = args.learning_rate / 2.6 ** (len(groups) - 1 - i)
                optimizer_grouped_parameters += [
                    {'params': [p for n, p in param_optimizer
                                if not any(nd in n for nd in no_decay) and any(nd in n for nd in group)],
                     'weight_decay': 0.01, 'lr': lr},
                    {'params': [p for n, p in param_optimizer
                                if any(nd in n for nd in no_decay) and any(nd in n for nd in group)],
                     'weight_decay': 0.0, 'lr': lr},
                ]
        else:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                  bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size, drop_last=True)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                logits = model(input_ids, segment_ids, input_mask)

                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss_fct = DataParallelCriterion(loss_fct)
                logits = [logits[i].view(-1, model.module.config.vocab_size) for i in range(len(logits))]
                loss = loss_fct(logits, lm_label_ids.view(-1))
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                # loss = model(input_ids, segment_ids, input_mask, lm_label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    logger.info("** ** * Saving fine-tuned model ** ** * ")
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
def createModels(args, userNum, itemNum, adj):
    if args.model == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'NMF':
        model = NMF(args.model, userNum, itemNum, 3, args.embedSize, args.droprate).cuda()
    elif args.model == 'NGCFMF':
        model = NGCFMF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMLP':
        model = NGCFMLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMFMLP':
        model = NGCFMFMLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF':
        model = NGCFMF_concat_MF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MLP':
        model = NGCFMF_concat_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF':
        model = NGCFMLP_concat_MF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MLP':
        model = NGCFMLP_concat_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMF_concat_MF_MLP':
        model = NGCFMF_concat_MF_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'NGCFMLP_concat_MF_MLP':
        model = NGCFMLP_concat_MF_MLP(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'GACFV1':
        model = GACFV1(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV2':
        model = GACFV2(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFMask':
        model = GACFMask(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'SPGA':
        model = SPGACF(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV3':
        model = GACFV3(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV4':
        model = GACFV4(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV5':
        model = GACFV5(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV6':
        model = GACFV6(userNum, itemNum, adj, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()

    if args.train_mode == 'PairSampling':
        lossfn = BPRLoss()
        if args.parallel:
            model = DataParallelModel(model)
            lossfn = DataParallelCriterion2(lossfn)
    elif args.train_mode == 'NegSampling':
        lossfn = BCEWithLogitsLoss()
        if args.parallel:
            model = DataParallelModel(model)        # parallelize the model
            lossfn = DataParallelCriterion(lossfn)  # parallelize the criterion

    optim = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    return model, lossfn, optim
def main(args):
    init(args)
    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
                              encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
                            encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr, schedule=args.lr_schedule, warmup=args.lr_warmup,
                           t_total=n_updates_total, b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                     path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss, model_opt,
                                             train_loader, val_loader, train_log_interval, val_log_interval,
                                             device, beam, gen_len, k, decoding_strategy, accum_iter,
                                             "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss, model_opt,
                                             train_loader, val_loader, train_log_interval, val_log_interval,
                                             device, beam, gen_len, k, decoding_strategy, accum_iter,
                                             "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress)
                           warmup=args.lr_warmup, t_total=n_updates_total,
                           b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    criterion_lm = DataParallelCriterion(criterion_lm)
    criterion_clf = DataParallelCriterion(criterion_clf)

    n_updates = 0
    n_epochs = 0
    if submit:
        path = os.path.join(save_dir, desc, 'best_params_para_selector')
        torch.save(dh_model.state_dict(), make_path(path))
    best_score = 0
    for i in range(args.n_iter):
        if i == 0:
            log_msmarco()
        print("running epoch", i)
        run_epoch()
        n_epochs += 1
        # log(save_dir, desc)
def createModels(args, userNum, itemNum, rt):
    if args.model == 'NCF':
        model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()
    elif args.model == 'GCF':
        model = GCF(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers).cuda()
    elif args.model == 'GACFV1':
        model = GACFV1(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV2':
        model = GACFV2(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV3':
        model = GACFV3(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV4':
        model = GACFV4(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV5':
        model = GACFV5(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    elif args.model == 'GACFV6':
        model = GACFV6(userNum, itemNum, rt, embedSize=args.embedSize, layers=args.layers, droprate=args.droprate).cuda()
    # model = SVD(userNum, itemNum, 50).cuda()
    # model = NCF(userNum, itemNum, 64, layers=[128, 64, 32, 16, 8]).cuda()

    if args.evaluate == 'MSE':
        lossfn = MSELoss()
    elif args.evaluate == 'RANK':
        lossfn = BCEWithLogitsLoss()
    if args.parallel:
        model = DataParallelModel(model)        # parallelize the model
        lossfn = DataParallelCriterion(lossfn)  # parallelize the criterion

    optim = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    return model, lossfn, optim
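# A hypothetical invocation of this factory; the Namespace fields mirror the
# attributes read inside createModels, and `rating_matrix` is a placeholder for
# the user-item interaction data expected as `rt`.
from argparse import Namespace

args = Namespace(model='GCF', embedSize=64, layers=[64, 64], droprate=0.1,
                 evaluate='MSE', parallel=True, lr=1e-3, weight_decay=1e-5)
model, lossfn, optim = createModels(args, userNum=1000, itemNum=2000, rt=rating_matrix)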
        # tar = target.contiguous().view(-1)
        # out = output.contiguous().view(target.size(0), -1)
        target = tar.contiguous().view(-1)
        output = out[:tar.size(0)]
        normalize = output.size(0) * output.size(1)
        output = output.contiguous().view(target.size(0), -1)
        loss = self.NLL(output, target) / normalize
        return loss


if not eval_model:
    criterion = NLLLoss(ignore_index=PAD)
    parallel_model = DataParallelModel(model)        # Encapsulate the model
    parallel_loss = DataParallelCriterion(criterion)

# ---------------------------
# def merge_res(res):
#     ((inds1, log_probs1, enc_out1), (inds2, log_probs2, enc_out2)) = res
#     inds = T.cat([inds1, inds2], dim=0).cpu()
#     enc_out = T.cat([enc_out1, enc_out2], dim=0).cpu()
#     if type(log_probs1) != list:
#         log_probs = T.cat([log_probs1, log_probs2], dim=0)
#         return inds, log_probs, enc_out
#     else:
#         return inds, _, enc_out
def train(config):
    net = BertForMaskedLM.from_pretrained(config.model)
    lossFunc = KLDivLoss(config)

    if torch.cuda.is_available():
        net = net.cuda()
        lossFunc = lossFunc.cuda()

    if config.dataParallel:
        net = DataParallelModel(net)
        lossFunc = DataParallelCriterion(lossFunc)

    options = optionsLoader(LOG, config.optionFrames, disp=False)
    Tokenizer = BertTokenizer.from_pretrained(config.model)
    prepareFunc = prepare_data

    trainSet = Dataset('train', config.batch_size, lambda x: len(x[0]) + len(x[1]),
                       prepareFunc, Tokenizer, options['dataset'], LOG, 'train')
    validSet = Dataset('valid', config.batch_size, lambda x: len(x[0]) + len(x[1]),
                       prepareFunc, Tokenizer, options['dataset'], LOG, 'valid')
    print(trainSet.__len__())

    Q = []
    best_vloss = 1e99
    counter = 0
    lRate = config.lRate

    prob_src = config.prob_src
    prob_tgt = config.prob_tgt

    num_train_optimization_steps = trainSet.__len__() * options['training']['stopConditions']['max_epoch']
    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=lRate, e=1e-9,
                         t_total=num_train_optimization_steps, warmup=0.0)

    for epoch_idx in range(options['training']['stopConditions']['max_epoch']):
        total_seen = 0
        total_similar = 0
        total_unseen = 0
        total_source = 0

        trainSet.setConfig(config, prob_src, prob_tgt)
        trainLoader = data.DataLoader(dataset=trainSet, batch_size=1, shuffle=True,
                                      num_workers=config.dataLoader_workers, pin_memory=True)
        validSet.setConfig(config, 0.0, prob_tgt)
        validLoader = data.DataLoader(dataset=validSet, batch_size=1, shuffle=False,
                                      num_workers=config.dataLoader_workers, pin_memory=True)

        for batch_idx, batch_data in enumerate(trainLoader):
            if (batch_idx + 1) % 10000 == 0:
                gc.collect()
            start_time = time.time()
            net.train()

            inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data
            inputs = inputs[0].cuda()
            positions = positions[0].cuda()
            token_types = token_types[0].cuda()
            labels = labels[0].cuda()
            masks = masks[0].cuda()
            total_seen += batch_seen
            total_similar += batch_similar
            total_unseen += batch_unseen
            total_source += batch_source

            n_token = int((labels.data != 0).data.sum())

            predicts = net(inputs, positions, token_types, masks)
            loss = lossFunc(predicts, labels, n_token).sum()
            Q.append(float(loss))
            if len(Q) > 200:
                Q.pop(0)
            loss_avg = sum(Q) / len(Q)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            LOG.log('Epoch %2d, Batch %6d, Loss %9.6f, Average Loss %9.6f, Time %9.6f' %
                    (epoch_idx + 1, batch_idx + 1, loss, loss_avg, time.time() - start_time))

            # Checkpoints
            idx = epoch_idx * trainSet.__len__() + batch_idx + 1
            if (idx >= options['training']['checkingPoints']['checkMin']) and \
                    (idx % options['training']['checkingPoints']['checkFreq'] == 0):
                if config.do_eval:
                    vloss = 0
                    total_tokens = 0
                    for bid, batch_data in enumerate(validLoader):
                        inputs, positions, token_types, labels, masks, batch_seen, batch_similar, batch_unseen, batch_source = batch_data
                        inputs = inputs[0].cuda()
                        positions = positions[0].cuda()
                        token_types = token_types[0].cuda()
                        labels = labels[0].cuda()
                        masks = masks[0].cuda()
                        n_token = int((labels.data != config.PAD).data.sum())
                        with torch.no_grad():
                            net.eval()
                            predicts = net(inputs, positions, token_types, masks)
                            vloss += float(lossFunc(predicts, labels).sum())
                        total_tokens += n_token
                    vloss /= total_tokens
                    is_best = vloss < best_vloss
                    best_vloss = min(vloss, best_vloss)
                    LOG.log('CheckPoint: Validation Loss %11.8f, Best Loss %11.8f' % (vloss, best_vloss))
                    if is_best:
                        LOG.log('Best Model Updated')
                        save_check_point({
                            'epoch': epoch_idx + 1,
                            'batch': batch_idx + 1,
                            'options': options,
                            'config': config,
                            'state_dict': net.state_dict(),
                            'best_vloss': best_vloss
                        }, is_best, path=config.save_path, fileName='latest.pth.tar')
                        counter = 0
                    else:
                        counter += options['training']['checkingPoints']['checkFreq']
                        if counter >= options['training']['stopConditions']['rateReduce_bound']:
                            counter = 0
                            for param_group in optimizer.param_groups:
                                lr_ = param_group['lr']
                                param_group['lr'] *= 0.55
                                _lr = param_group['lr']
                            LOG.log('Reduce Learning Rate from %11.8f to %11.8f' % (lr_, _lr))
                        LOG.log('Current Counter = %d' % (counter))
                else:
                    save_check_point({
                        'epoch': epoch_idx + 1,
                        'batch': batch_idx + 1,
                        'options': options,
                        'config': config,
                        'state_dict': net.state_dict(),
                        'best_vloss': 1e99
                    }, False, path=config.save_path,
                        fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '_Batch' + str(batch_idx + 1) + '.pth.tar')
                    LOG.log('CheckPoint Saved!')

        if options['training']['checkingPoints']['everyEpoch']:
            save_check_point({
                'epoch': epoch_idx + 1,
                'batch': batch_idx + 1,
                'options': options,
                'config': config,
                'state_dict': net.state_dict(),
                'best_vloss': 1e99
            }, False, path=config.save_path,
                fileName='checkpoint_Epoch' + str(epoch_idx + 1) + '.pth.tar')

        LOG.log('Epoch Finished.')
        LOG.log('Total Seen: %d, Total Unseen: %d, Total Similar: %d, Total Source: %d.' %
                (total_seen, total_unseen, total_similar, total_source))
        gc.collect()
if config.model_type == 'LSTM':
    model = LSTMLM(input_size=len(vocab),
                   embedding_size=config.embedding_size,
                   hidden_size=config.hidden_size,
                   output_size=len(vocab),
                   n_layers=config.n_layers,
                   dropout_p=config.dropout_p)
elif config.model_type == 'BiLSTM':
    model = BiLSTMLM(input_size=len(vocab),
                     embedding_size=config.embedding_size,
                     hidden_size=config.hidden_size,
                     output_size=len(vocab),
                     n_layers=config.n_layers,
                     dropout_p=config.dropout_p)

loss_fn = nn.NLLLoss(ignore_index=vocab.stoi[vocab.pad_token])
optimizer = optim.Adam(model.parameters())

if config.cuda:
    if config.multi_gpu:
        from parallel import DataParallelModel, DataParallelCriterion
        model = DataParallelModel(model).cuda()
        loss_fn = DataParallelCriterion(loss_fn).cuda()
    else:
        model = model.cuda()
        loss_fn = loss_fn.cuda()

print('=========MODEL=========\n', model)

# Train
for epoch in range(1, config.epochs + 1):
    train()
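# The train() called above is defined elsewhere. A minimal sketch of one step
# under this setup, with `batch.text` and `batch.target` as hypothetical field
# names; under multi_gpu, model(...) returns a list of per-GPU log-probability
# tensors that the wrapped loss_fn knows how to consume.
def train_step(batch):
    optimizer.zero_grad()
    log_probs = model(batch.text)            # tensor, or list of tensors when multi_gpu
    loss = loss_fn(log_probs, batch.target)  # DataParallelCriterion reduces across GPUs
    loss.backward()
    optimizer.step()
    return loss.item()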
def do_eval(model, logger, output_dir, device, tr_loss, nb_tr_steps, global_step, processor,
            label_list, tokenizer, eval_dataloader, error_analysis_dict, output_mode, i, task):
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    preds = []
    all_label_ids = []
    all_input_ids = []

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc='Evaluating'):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, i, output_mode)

        if output_mode == 'classification':
            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)
            logits = [logits[i].view(-1, logits[0].size(1)) for i in range(len(logits))]
            tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
        else:
            loss_fct = MSELoss()
            loss_fct = DataParallelCriterion(loss_fct)
            logits = [logits[i].view(-1) for i in range(len(logits))]
            tmp_eval_loss = loss_fct(logits, label_ids.view(-1))

        logits = gather(logits, target_device='cuda:0')

        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        if len(all_label_ids) == 0:
            all_label_ids.append(label_ids.detach().cpu().numpy())
        else:
            all_label_ids[0] = np.append(all_label_ids[0], label_ids.detach().cpu().numpy(), axis=0)
        if len(all_input_ids) == 0:
            all_input_ids.append(input_ids.detach().cpu().numpy())
        else:
            all_input_ids[0] = np.append(all_input_ids[0], input_ids.detach().cpu().numpy(), axis=0)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    preds = preds[0]
    all_label_ids = all_label_ids[0]
    all_input_ids = all_input_ids[0]
    all_pids = error_analysis_dict['pids'][:len(preds)]
    all_text_a = error_analysis_dict['text_a'][:len(preds)]
    all_text_b = error_analysis_dict['text_b'][:len(preds)]
    all_textpair_tokenized = [' '.join(tokenizer.convert_ids_to_tokens(ids)) for ids in all_input_ids]
    assert len(preds) == len(all_label_ids) == len(all_input_ids) == len(all_pids) \
        == len(all_text_a) == len(all_text_b) == len(all_textpair_tokenized)
    all_textpair_tokenized = [tp.replace('[PAD]', '').strip() for tp in all_textpair_tokenized]

    if output_mode == 'classification':
        preds = np.argmax(preds, axis=1)
        preds_rounded = preds
        eval_accuracy = accuracy(preds, all_label_ids)
    else:
        preds = np.squeeze(preds)
        preds_rounded = np.round(preds * 4) / 4
        eval_accuracy = pearsonr(preds, all_label_ids)[0]

    errors = generate_errors(preds, preds_rounded, all_label_ids, all_pids,
                             all_text_a, all_text_b, all_textpair_tokenized)
    if i == 0:
        errors.to_csv(os.path.join(output_dir, 'error_table.csv'), sep=',', index=False)

    result = {
        'task name': task,
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'global_step': global_step,
        'loss': tr_loss / nb_tr_steps
    }

    output_eval_file = os.path.join(output_dir, 'eval_results.txt')
    with open(output_eval_file, 'w') as writer:
        logger.info('******** Eval Results *****')
        for key in sorted(result.keys()):
            logger.info('  %s = %s', key, str(result[key]))
            # writer.write('{} = {}\n'.format(key, str(result[key])))
    return eval_accuracy
def main(args):
    init(args)

    # Args setup:
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    save_dir_local = "checkpoints_local"
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_dir_local, exist_ok=True)
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    p = args.p
    n_ctx = args.n_ctx
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    # Text Encoder
    if args.use_offline_gpt2:
        text_encoder = GPT2Tokenizer.from_pretrained('./gpt2model')
    elif args.debug_mode:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2')
    else:
        text_encoder = GPT2Tokenizer.from_pretrained('gpt2-medium')

    text_encoder.add_special_tokens({
        'bos_token': '_start_',
        'cls_token': '_classify_',
        'eos_token': '_end_',
        'additional_special_tokens': ['_kw_', '_endkw_', '_t_', '_i_', '_b_', '_c_']
    })

    vocab = len(text_encoder)

    print("Loading dataset...")
    if args.use_model == "base":
        train_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, text_encoder,
            num_workers=3, shuffle=True, gen_len=gen_len, n_ctx=n_ctx,
            include_discourse_type=args.use_discourse, include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex, include_kw=not args.exclude_kw, dim=args.n_embd,
            debug_mode=args.debug_mode)
        val_loader = get_paragraph_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, text_encoder,
            num_workers=0, shuffle=False, gen_len=gen_len, n_ctx=n_ctx,
            include_discourse_type=args.use_discourse, include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples, include_kw=not args.exclude_kw, dim=args.n_embd,
            debug_mode=args.debug_mode)
        print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))
        doc_model = GPT2BaseModel(args, vocab=vocab, n_ctx=n_ctx, gen_len=gen_len,
                                  lastidx=text_encoder.eos_token_id,
                                  includeprev=args.use_neighbor_feat,
                                  use_offline_gpt2=args.use_offline_gpt2)
    elif args.use_model == "plotmachines":
        train_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, text_encoder,
            num_workers=3, shuffle=True, gen_len=gen_len, n_ctx=n_ctx,
            include_discourse_type=args.use_discourse, include_neigh=args.use_neighbor_feat,
            max_size=args.max_ex, include_kw=not args.exclude_kw, memsize=args.memstatesize,
            dim=args.n_embd, use_kwmem=True, debug_mode=args.debug_mode)
        val_loader = get_paragraph_memory_input_loader(
            os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, text_encoder,
            num_workers=0, shuffle=False, gen_len=gen_len, n_ctx=n_ctx,
            include_discourse_type=args.use_discourse, include_neigh=args.use_neighbor_feat,
            max_size=args.num_val_examples, include_kw=not args.exclude_kw,
            memsize=args.memstatesize, dim=args.n_embd, use_kwmem=True,
            debug_mode=args.debug_mode)
        print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))
        doc_model = PlotMachinesModel(args, vocab=vocab, n_ctx=n_ctx, gen_len=gen_len,
                                      lastidx=text_encoder.eos_token_id,
                                      includeprev=args.use_neighbor_feat,
                                      use_offline_gpt2=args.use_offline_gpt2)

    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs)

    if args.debug_mode:
        print_model_params(log_dir, doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = AdamW(filter(lambda p: p.requires_grad, doc_model.parameters()),
                      lr=args.lr, betas=(args.b1, args.b2), eps=args.e)
    lm_loss = ParagraphLoss(criterion, n_ctx=n_ctx, gen_len=gen_len)

    print("Loading Model")
    doc_model.to(device)
    if n_gpu > 1:
        doc_model = DataParallelModel(doc_model)
        lm_loss = DataParallelCriterion(lm_loss)
        print("Parallelized")

    bestloss = -1
    start_iter, running_loss = 1, 0
    prevloss = 1000

    start_iter, running_loss = load_checkpoint(args.checkpoint, doc_model, model_opt)
    for i in range(args.num_epochs):
        start_iter, running_loss, bestloss, updates, val_loss1 = run_epoch(
            bestloss, start_iter, running_loss, doc_model, lm_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval, device,
            beam, gen_len, k, p, decoding_strategy, accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs),
            save_dir, logger, text_encoder, show_progress=args.show_progress,
            my_local_dir=save_dir_local)
        print("VAL LOSS: ", str(val_loss1))
        if val_loss1 > prevloss or math.isnan(val_loss1):
            break
        prevloss = val_loss1

    print('Done training...')
    print('Evaluating on validation with best checkpoint...')

    bestcheck = os.path.join(save_dir, "checkpoint_best.pt")
    checkpoint = torch.load(bestcheck, map_location='cpu')
    state_dict = checkpoint["state_dict"]
    if state_dict.get('module.pos_emb_mask') is None \
            and doc_model.state_dict().get('module.pos_emb_mask') is not None:
        state_dict['module.pos_emb_mask'] = doc_model.state_dict().get('module.pos_emb_mask')
    doc_model.load_state_dict(state_dict)
    evaluate_doc_model(doc_model, val_loader, text_encoder, device, beam, gen_len, k, p,
                       args.decoding_strategy, os.path.join(save_dir, 'valeval.log'),
                       'gen', 'tgt', gen_len, [], args)
def main_tr(args, crossVal):
    dataLoad = ld.LoadData(args.data_dir, args.classes)
    data = dataLoad.processData(crossVal, args.data_name)

    # load the model
    model = net.MiniSeg(args.classes, aux=True)
    if not osp.isdir(osp.join(args.savedir + '_mod' + str(args.max_epochs))):
        os.mkdir(args.savedir + '_mod' + str(args.max_epochs))
    if not osp.isdir(osp.join(args.savedir + '_mod' + str(args.max_epochs), args.data_name)):
        os.mkdir(osp.join(args.savedir + '_mod' + str(args.max_epochs), args.data_name))
    saveDir = args.savedir + '_mod' + str(args.max_epochs) + '/' + args.data_name + '/' + args.model_name
    # create the directory if it does not exist
    if not osp.exists(saveDir):
        os.mkdir(saveDir)

    if args.gpu and torch.cuda.device_count() > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)
    if args.gpu:
        model = model.cuda()

    total_paramters = sum([np.prod(p.size()) for p in model.parameters()])
    print('Total network parameters: ' + str(total_paramters))

    # define optimization criteria
    weight = torch.from_numpy(data['classWeights'])  # convert the numpy array to torch
    if args.gpu:
        weight = weight.cuda()

    criteria = CrossEntropyLoss2d(weight, args.ignore_label)  # weight
    if args.gpu and torch.cuda.device_count() > 1:
        criteria = DataParallelCriterion(criteria)
    if args.gpu:
        criteria = criteria.cuda()

    # compose the data with transforms
    trainDataset_main = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale1 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.5), int(args.height * 1.5)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale2 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 1.25), int(args.height * 1.25)),
        myTransforms.RandomCropResize(int(100. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    trainDataset_scale3 = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(int(args.width * 0.75), int(args.height * 0.75)),
        myTransforms.RandomCropResize(int(32. / 1024. * args.width)),
        myTransforms.RandomFlip(),
        myTransforms.ToTensor()
    ])
    valDataset = myTransforms.Compose([
        myTransforms.Normalize(mean=data['mean'], std=data['std']),
        myTransforms.Scale(args.width, args.height),
        myTransforms.ToTensor()
    ])

    # since we train from scratch, we create data loaders at different scales
    # so that we can generate more augmented data and prevent the network from overfitting
    trainLoader = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_main),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=True, drop_last=True)
    trainLoader_scale1 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale1),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=True, drop_last=True)
    trainLoader_scale2 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale2),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=True, drop_last=True)
    trainLoader_scale3 = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['trainIm'], data['trainAnnot'], transform=trainDataset_scale3),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=True, drop_last=True)
    valLoader = torch.utils.data.DataLoader(
        myDataLoader.Dataset(data['valIm'], data['valAnnot'], transform=valDataset),
        batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
        pin_memory=True)
    max_batches = len(trainLoader) + len(trainLoader_scale1) + len(trainLoader_scale2) + len(trainLoader_scale3)

    if args.gpu:
        cudnn.benchmark = True

    start_epoch = 0
    if args.pretrained is not None:
        state_dict = torch.load(args.pretrained)
        new_keys = []
        new_values = []
        for idx, key in enumerate(state_dict.keys()):
            if 'pred' not in key:
                new_keys.append(key)
                new_values.append(list(state_dict.values())[idx])
        new_dict = OrderedDict(list(zip(new_keys, new_values)))
        model.load_state_dict(new_dict, strict=False)
        print('pretrained model loaded')

    if args.resume is not None:
        if osp.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            args.lr = checkpoint['lr']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    log_file = osp.join(saveDir, 'trainValLog_' + args.model_name + '.txt')
    if osp.isfile(log_file):
        logger = open(log_file, 'a')
    else:
        logger = open(log_file, 'w')
        logger.write("Parameters: %s" % (str(total_paramters)))
        logger.write("\n%s\t%s\t\t%s\t%s\t%s\t%s\tlr" %
                     ('CrossVal', 'Epoch', 'Loss(Tr)', 'Loss(val)', 'mIOU (tr)', 'mIOU (val)'))
    logger.flush()

    optimizer = torch.optim.Adam(model.parameters(), args.lr, (0.9, 0.999),
                                 eps=1e-08, weight_decay=1e-4)

    maxmIOU = 0
    maxEpoch = 0
    print(args.model_name + '-CrossVal: ' + str(crossVal + 1))
    for epoch in range(start_epoch, args.max_epochs):
        # train for one epoch
        cur_iter = 0

        train(args, trainLoader_scale1, model, criteria, optimizer, epoch, max_batches, cur_iter)
        cur_iter += len(trainLoader_scale1)
        train(args, trainLoader_scale2, model, criteria, optimizer, epoch, max_batches, cur_iter)
        cur_iter += len(trainLoader_scale2)
        train(args, trainLoader_scale3, model, criteria, optimizer, epoch, max_batches, cur_iter)
        cur_iter += len(trainLoader_scale3)
        lossTr, overall_acc_tr, per_class_acc_tr, per_class_iu_tr, mIOU_tr, lr = \
            train(args, trainLoader, model, criteria, optimizer, epoch, max_batches, cur_iter)

        # evaluate on validation set
        lossVal, overall_acc_val, per_class_acc_val, per_class_iu_val, mIOU_val = \
            val(args, valLoader, model, criteria)

        torch.save({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lossTr': lossTr,
            'lossVal': lossVal,
            'iouTr': mIOU_tr,
            'iouVal': mIOU_val,
            'lr': lr
        }, osp.join(saveDir, 'checkpoint_' + args.model_name + '_crossVal' + str(crossVal + 1) + '.pth.tar'))

        # save the model also
        model_file_name = osp.join(saveDir, 'model_' + args.model_name + '_crossVal' +
                                   str(crossVal + 1) + '_' + str(epoch + 1) + '.pth')
        torch.save(model.state_dict(), model_file_name)

        logger.write("\n%d\t\t%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.7f" %
                     (crossVal + 1, epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val, lr))
        logger.flush()
        print("\nEpoch No. %d:\tTrain Loss = %.4f\tVal Loss = %.4f\t mIOU(tr) = %.4f\t mIOU(val) = %.4f\n"
              % (epoch + 1, lossTr, lossVal, mIOU_tr, mIOU_val))

        if mIOU_val >= maxmIOU:
            maxmIOU = mIOU_val
            maxEpoch = epoch + 1
        torch.cuda.empty_cache()

    logger.flush()
    logger.close()
    return maxEpoch, maxmIOU
def train(task_ids, model): tasks = [args.tasks[task_id] for task_id in task_ids] logger.info("start to train { task: %s, seq train type: %s }" % (tasks, args.seq_train_type)) model_dir = get_model_dir(tasks) make_dir(model_dir) #train_dataset = [(TASK_DICT[t]["train"] if not args.seq_distil else TASK_DICT[t]["train"].replace("train", "distil")) for t in tasks] train_dataset = [ swap_name(TASK_DICT[t]["train"], args.seq_distil, args.ref1) for t in tasks ] train_extra_data = [] if "lll" in args.seq_train_type and task_ids[0] > 0 and not args.skip_tasks: prev_task = args.tasks[task_ids[0] - 1] with torch.no_grad(): create_extra_data(tasks[0], prev_task, model, train_extra_data) elif "gem" in args.seq_train_type and task_ids[0] > 0: get_real_data(tasks[0], train_extra_data, accum=False, encode=True) args.memory_data.append(train_extra_data) train_extra_data = [] logger.info('extra training data size: {}'.format(len(train_extra_data))) if not model: # which_model_to_load = model_dir if os.path.isfile(os.path.join(model_dir, FINAL_SAVE_NAME)) else args.model_name model = MODEL_CLASS.from_pretrained(args.model_name).cuda() model.resize_token_embeddings(len(TOKENIZER)) if not args.fp32: model = FP16_Module(model) gen_token = get_gen_token(tasks[0]) TOKENIZER.add_tokens([gen_token]) TOKENIZER.save_pretrained(model_dir) SPECIAL_TOKENS[tasks[0]] = gen_token SPECIAL_TOKEN_IDS[tasks[0]] = TOKENIZER.convert_tokens_to_ids(gen_token) logger.info('gen token = {} , gen token id = {}'.format( gen_token, SPECIAL_TOKEN_IDS[tasks[0]])) MODEL_CONFIG.vocab_size = len(TOKENIZER) MODEL_CONFIG.to_json_file(os.path.join(model_dir, CONFIG_NAME)) global TOKENS_WEIGHT if len(TOKENIZER) != TOKENS_WEIGHT.shape[0]: TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda())) if args.skip_tasks and len(tasks) == 1: logger.info("*********** skip task: {} ***********".format(tasks[0])) if tasks[0] in args.skip_tasks: if len(args.skip_tasks) == 1: model_dir = get_model_dir(tasks) model_path = os.path.join(model_dir, FINAL_SAVE_NAME) config_path = os.path.join(model_dir, CONFIG_NAME) model_config = CONFIG_CLASS.from_json_file(config_path) model = MODEL_CLASS(model_config).cuda() state_dict = torch.load(model_path) model.load_state_dict(state_dict) if not args.fp32: model = FP16_Module(model) if args.seq_train_type in REG_TYPE_KEYS: logger.info("calulating reg_params ...") train_qadata = QADataset(train_dataset, "train", SPECIAL_TOKEN_IDS[tasks[0]], train_extra_data) max_train_batch_size = max( len(train_qadata) // args.min_n_steps, args.min_batch_size) train_dataloader = create_dataloader( train_qadata, "train", max_train_batch_size) parallel_model = DataParallelModel(WrapModel(model), args.device_ids) regularizer = REG_TYPES[args.seq_train_type]( model, parallel_model, [train_dataloader], tasks[0]) regularizer.task_start_do() regularizer.task_end_do() torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME)) logger.info("done reg_params!") args.skip_tasks.remove(tasks[0]) return model model.resize_token_embeddings( len(TOKENIZER) if not args.multitask_specific else len(TOKENIZER) + 4) if args.multitask_specific: for i in range(4): TOKENS_WEIGHT = torch.cat((TOKENS_WEIGHT, torch.ones([1]).cuda())) if args.distil: teacher_model = MODEL_CLASS.from_pretrained(args.model_name).cuda() teacher_vocab_size = json.load( open("models/gpt2/lll/{task}_0.2/{task}/config.json".format( task=tasks[0])))['vocab_size'] teacher_model.resize_token_embeddings(teacher_vocab_size) print("load teacher model from {}".format( 
"models/gpt2/lll/{task}_0.2/{task}/model-finish".format( task=tasks[0]))) teacher_model.load_state_dict( torch.load("models/gpt2/lll/{task}_0.2/{task}/model-finish".format( task=tasks[0]))) if not args.fp32: teacher_model = FP16_Module(teacher_model) teacher_model.eval() teacher_model = DataParallelModel(WrapModel(teacher_model), args.device_ids) if not args.fp32: # again because resize_token_embeddings makes embedding layer fp32 model = FP16_Module(model) parallel_model = DataParallelModel(WrapModel(model), args.device_ids) train_qadata = QADataset(train_dataset, "train", SPECIAL_TOKEN_IDS[tasks[0]], train_extra_data) max_train_batch_size = max( len(train_qadata) // args.min_n_steps, args.min_batch_size) train_dataloader = create_dataloader(train_qadata, "train", max_train_batch_size) if not args.unbound and args.seq_train_type not in [ "multitask", "multilm" ]: #n_train_epochs = TASK_DICT[tasks[0]]["n_train_epochs"] n_train_epochs = args.n_train_epochs[tasks[0]] else: n_train_epochs = args.n_train_epochs['_'.join(tasks)] n_train_optimization_steps = len(train_qadata) * n_train_epochs logger.info( 'len of train dataset: {} , max train batch size {} , num of opt steps: {}' .format(len(train_qadata), max_train_batch_size, n_train_optimization_steps)) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if "gem" in args.seq_train_type: model.task_id = task_ids[0] if not hasattr(model, "grad_dims"): model.grad_dims = [] for param in model.parameters(): model.grad_dims.append(param.data.numel()) if not hasattr(model, "grads"): model.grads = torch.zeros(sum(model.grad_dims), len(args.tasks)) model.grads = model.grads.cuda() if args.seq_train_type in REG_TYPE_KEYS: optimizer = Weight_Regularized_AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) if not args.fp32: optimizer = FP16_Optimizer(optimizer, static_loss_scale=None, dynamic_loss_scale=True, dynamic_loss_args={ 'scale_window': 100, 'min_scale': 1, 'delayed_shift': 2 }) scheduler = AnnealingLR(optimizer, start_lr=args.learning_rate, warmup_iter=int(args.n_warmup_ratio * len(train_qadata)), num_iters=int(n_train_optimization_steps), decay_style=args.decay_style) train_loss_fct = DataParallelCriterion( CrossEntropyLoss(ignore_index=FILL_VAL, weight=TOKENS_WEIGHT), args.device_ids) if args.distil: kd_loss_fct = DataParallelCriterion( nn.KLDivLoss(reduction="batchmean"), args.device_ids) if args.seq_train_type in REG_TYPE_KEYS: copy_train_dataloader = create_dataloader(train_qadata, "train", max_train_batch_size) prev_task = args.tasks[task_ids[0] - 1] regularizer = REG_TYPES[args.seq_train_type](model, parallel_model, [copy_train_dataloader], tasks[0], prev_task) regularizer.task_start_do() tot_n_steps = 0 train_once = TrainStep(model, optimizer, scheduler) if "gem" in args.seq_train_type and task_ids[0] != 0: gem_step = GEMStep(model, parallel_model, train_loss_fct, optimizer) model.train() for ep in range(n_train_epochs): cum_loss, cum_qa_loss, cum_lm_loss, cur_n_inputs = 0, 0, 0, 0 for n_steps, (_, _, cqa, _, Y, gen_X, gen_Y, is_extra) in enumerate(train_dataloader): n_inputs = sum(_cqa.shape[0] for _cqa in 
cqa) if args.multitask_specific: for i in range(len(is_extra)): gen_X[i][:, 0] += is_extra[i] is_extra[i] = is_extra[i] * 0 for i in range(len(cqa)): cqa[i] = (cqa[i].to(args.device_ids[i]), ) Y[i] = Y[i].to(args.device_ids[i]) gen_X[i] = (gen_X[i].to(args.device_ids[i]), ) gen_Y[i] = gen_Y[i].to(args.device_ids[i]) is_extra[i] = is_extra[i].to(args.device_ids[i]) if args.distil: losses = get_distil_losses(teacher_model, parallel_model, cqa, Y, gen_X, gen_Y, is_extra, kd_loss_fct, train_loss_fct, args.temperature_kd, pad_idx=FILL_VAL) else: losses = get_losses(parallel_model, cqa, Y, gen_X, gen_Y, train_loss_fct) loss = sum(losses) if "gem" in args.seq_train_type and task_ids[0] != 0: gem_step(task_ids[0]) train_once(loss, n_inputs) qa_loss = losses[0].item() * n_inputs lm_loss = losses[1].item() * n_inputs cum_loss += (qa_loss + lm_loss) cum_qa_loss += qa_loss cum_lm_loss += lm_loss cur_n_inputs += n_inputs if (n_steps + 1) % args.logging_steps == 0: logger.info( 'progress {:.3f} , lr {:.1E} , loss {:.3f} , qa loss {:.3f} , lm loss {:.3f} , avg batch size {:.1f}' .format(ep + cur_n_inputs / len(train_qadata), scheduler.get_lr(), cum_loss / cur_n_inputs, cum_qa_loss / cur_n_inputs, cum_lm_loss / cur_n_inputs, cur_n_inputs / (n_steps + 1))) torch.save(model.state_dict(), os.path.join(model_dir, SAVE_NAME + str(ep + 1))) tot_n_steps += (n_steps + 1) logger.info( 'epoch {}/{} done , tot steps {} , lr {:.1E} , loss {:.2f} , qa loss {:.2f} , lm loss {:.2f} , avg batch size {:.1f}' .format(ep + 1, n_train_epochs, tot_n_steps, scheduler.get_lr(), cum_loss / cur_n_inputs, cum_qa_loss / cur_n_inputs, cum_lm_loss / cur_n_inputs, cur_n_inputs / (n_steps + 1))) # task end do for reg if args.seq_train_type in REG_TYPE_KEYS: regularizer.task_end_do() torch.save(model.state_dict(), os.path.join(model_dir, FINAL_SAVE_NAME)) return model
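# The get_losses()/train_loss_fct pattern above relies on DataParallelModel
# returning one output tensor per GPU and DataParallelCriterion scattering the
# still-gathered targets itself. A minimal self-contained sketch of that
# contract, assuming the same parallel.py helpers imported elsewhere in this
# file and >= 2 visible GPUs; TinyLM and all shapes are illustrative, and this
# uses the stock scatter behavior rather than the pre-scattered batches above.
import torch
import torch.nn as nn
from parallel import DataParallelModel, DataParallelCriterion

class TinyLM(nn.Module):
    def __init__(self, vocab=100, dim=32):
        super().__init__()
        self.emb = nn.Embedding(vocab, dim)
        self.out = nn.Linear(dim, vocab)

    def forward(self, x):
        return self.out(self.emb(x))  # (batch, seq, vocab)

device_ids = list(range(torch.cuda.device_count()))
parallel_model = DataParallelModel(TinyLM().cuda(), device_ids)
loss_fct = DataParallelCriterion(nn.CrossEntropyLoss(), device_ids)

x = torch.randint(0, 100, (8, 16)).cuda()  # (batch, seq); batch divisible by #GPUs
y = torch.randint(0, 100, (8, 16)).cuda()
logits = parallel_model(x)                           # list: one (chunk, seq, vocab) per GPU
logits = [l.view(-1, l.size(-1)) for l in logits]    # flatten each chunk for CE
loss = loss_fct(logits, y.view(-1))                  # criterion scatters y, returns mean loss
loss.backward()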
def main():
    parser = setup_parser()
    args = parser.parse_args()
    logger.info('@@@@@ START @@@@@')
    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    n_gpu = torch.cuda.device_count()
    logger.info('device %s n_gpu %d', device, n_gpu)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    if args.tasks == 'all':
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
    elif args.tasks == 'single':
        task_names = ['medsts', 'mednli']
        data_dirs = ['MEDSTS', 'MEDNLI']
        task_names = [task_names[int(args.target_task_id)]]
        data_dirs = [data_dirs[int(args.target_task_id)]]
    if args.k_fold:
        target_data_dir = data_dirs[args.target_task_id]
        k_fold_data_dir = target_data_dir + '/k_fold_{}'.format(args.k)
        data_dirs[args.target_task_id] = k_fold_data_dir
    # if args.add_medsts_c:
    #     assert args.k_fold == True
    #     task_names.append('medsts_c')
    #     data_dirs.append('MEDSTS_c')
    #     k_fold_data_dir = data_dirs[-1] + '/k_fold_{}'.format(args.k)
    #     data_dirs[-1] = k_fold_data_dir
    if task_names[0] not in processors:
        raise ValueError('Task not found: {}'.format(task_names[0]))
    processor_list = [processors[task_name]() for task_name in task_names]
    label_list = [processor.get_labels() for processor in processor_list]
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    num_tasks = len(task_names)
    if args.do_train:
        train_examples = [processor.get_train_examples(args.data_dir + data_dir)
                          for processor, data_dir in zip(processor_list, data_dirs)]
        num_train_steps = int(len(train_examples[0]) / args.train_batch_size * args.num_train_epochs)
        if args.tasks == 'all':
            total_tr = args.tr_factor * num_tasks * int(args.num_train_epochs)
        else:
            total_tr = int(0.5 * num_train_steps)
        if args.tasks == 'all':
            steps_per_epoch = args.gradient_accumulation_steps * args.tr_factor * num_tasks
        else:
            steps_per_epoch = int(num_train_steps / (2. * args.num_train_epochs))

    bert_config.num_tasks = num_tasks
    bert_config.hidden_size_aug = int(args.h_aug)
    model = BertForMultiTask(bert_config, [len(labels) for labels in label_list])
    if args.init_checkpoint is not None:
        if args.multi:
            load_checkpoint_mult(args.init_checkpoint, model, args.same, args.tasks)
        else:
            model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
    if args.freeze:
        for n, p in model.bert.named_parameters():
            if 'aug' in n or 'classifier' in n or 'mult' in n or 'gamma' in n or 'beta' in n:
                continue
            p.requires_grad = False
    model.to(device)
    if n_gpu > 1:
        model = DataParallelModel(model)

    group_size = 2
    optimizer_parameters = get_param_groups(model, args, group_size)
    optimizer = BERTAdam(optimizer_parameters, lr=args.learning_rate,
                         warmup=args.warmup_proportion, t_total=total_tr)

    if args.do_eval:
        eval_loaders = []
        error_analysis_dicts = []
        for i, task in enumerate(task_names):
            eval_examples = processor_list[i].get_dev_examples(args.data_dir + data_dirs[i])
            eval_features = convert_examples_to_features(eval_examples, label_list[i],
                                                         args.max_seq_length, tokenizer,
                                                         output_modes[task])
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_pids = [int(f.pid) for f in eval_examples]
            all_text_a = [f.text_a for f in eval_examples]
            all_text_b = [f.text_b for f in eval_examples]
            error_data = {'pids': all_pids, 'text_a': all_text_a, 'text_b': all_text_b}
            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float32)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_loaders.append(DataLoader(eval_data, sampler=eval_sampler,
                                           batch_size=args.eval_batch_size, drop_last=True))
            error_analysis_dicts.append(error_data)

    global_step = 0
    if args.do_train:
        loaders = []
        logger.info(' Num tasks = {}'.format(len(train_examples)))
        for i, task in enumerate(task_names):
            train_features = convert_examples_to_features(train_examples[i], label_list[i],
                                                          args.max_seq_length, tokenizer,
                                                          output_modes[task])
            logger.info('********* Training data for {}'.format(task))
            logger.info(' Data size = {}'.format(len(train_features)))
            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_modes[task] == 'classification':
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            else:
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float32)
            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            train_sampler = RandomSampler(train_data)
            loaders.append(iter(DataLoader(train_data, sampler=train_sampler,
                                           batch_size=args.train_batch_size, drop_last=True)))
        total_params = sum(p.numel() for p in model.parameters())
        logger.info(' Num param = {}'.format(total_params))
        loaders = [cycle(it) for it in loaders]
        model.train()
        best_target_score = 0.
        task_id = 0
        all_ev_acc = []
        for epoch in trange(int(args.num_train_epochs), desc='Epoch'):
            if args.sample == 'anneal':
                probs = [len(dataset) for dataset in train_examples]
                probs = anneal(probs, epoch, args.num_train_epochs, anneal_factor=0.8,
                               target_task_id=0, weight=5)
            tr_loss = [0. for i in range(num_tasks)]
            nb_tr_examples, nb_tr_steps = 0, 0
            ## DEBUG
            # steps_per_epoch = 5
            for step in trange(steps_per_epoch, desc='Steps'):
                if step % args.gradient_accumulation_steps == 0:
                    task_id = np.random.choice(len(probs), p=probs)
                output_mode = output_modes[task_names[task_id]]
                batch = next(loaders[task_id])
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, task_id, output_mode)
                if output_mode == 'classification':
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1, logits[0].size(-1)) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                else:
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss[task_id] += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    model.zero_grad()
                    global_step += 1
            # this is where you'd calculate training acc
            ev_acc = []
            for i, task in enumerate(task_names):
                acc = do_eval(model, logger, args.output_dir, device, tr_loss[i], nb_tr_steps,
                              global_step, processor_list[i], label_list[i], tokenizer,
                              eval_loaders[i], error_analysis_dicts[i], output_modes[task], i, task)
                ev_acc.append(acc)
            all_ev_acc.append(ev_acc)
            # logger.info('Average acc: {}'.format(np.mean(ev_acc)))
            if ev_acc[args.target_task_id] > best_target_score:
                best_target_score = ev_acc[args.target_task_id]
                model_to_save = model.module if hasattr(model, 'module') else model
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                bert_config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)
            ## TODO: this is where you should add error analysis to get best version
        logger.info('Best target acc: {}'.format(best_target_score))
        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('******** Eval Results ********')
            for n, acc in enumerate(all_ev_acc):
                logger.info(' {} = {}\n'.format(n, acc))
                writer.write('{} \t {}\n'.format(n, acc))
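# anneal() above is defined elsewhere in this project; the call site only
# requires that it map raw dataset sizes to a probability vector over tasks
# (np.random.choice needs probs to sum to 1) that shifts weight toward the
# target task as epochs advance. One plausible sketch, illustrative only:
import numpy as np

def anneal(sizes, epoch, num_epochs, anneal_factor=0.8, target_task_id=0, weight=5):
    probs = np.asarray(sizes, dtype=np.float64)
    probs = probs ** anneal_factor                        # soften size imbalance between tasks
    progress = (epoch + 1) / float(num_epochs)
    probs[target_task_id] *= 1.0 + weight * progress      # favor the target task over time
    return probs / probs.sum()                            # valid distribution for np.random.choice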
def main():
    parser = setup_parser()
    args = parser.parse_args()
    processors = {
        'stsb': StsbProcessor,
        'mednli': MednliProcessor,
        'medsts': MedstsProcessor
    }
    output_modes = {
        'mnli': 'classification',
        'stsb': 'regression',
        'mednli': 'classification',
        'medsts': 'regression'
    }
    bert_types = {
        'discharge': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_disch_100000',
        'all': '/home/dc925/project/data/clinicalbert/biobert_pretrain_output_all_notes_150000',
        'base_uncased': 'bert-base-uncased',
        'base_cased': 'bert-base-cased'
    }

    ##################################################################################################
    ################################### SETUP DATA, DEVICE, MODEL ####################################
    ##################################################################################################
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        # Initialize the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: {}".format(task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels(output_mode)
    num_labels = len(label_list)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps
        ) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    ##################################################################################################
    ########################################### OPTIMIZER ############################################
    ##################################################################################################
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        if args.discriminative_finetuning:
            group1 = ['layer.0', 'layer.1.']
            group2 = ['layer.2', 'layer.3']
            group3 = ['layer.4', 'layer.5']
            group4 = ['layer.6', 'layer.7']
            group5 = ['layer.8', 'layer.9']
            group6 = ['layer.10', 'layer.11']
            group_all = ['layer.0', 'layer.1.', 'layer.2', 'layer.3', 'layer.4', 'layer.5',
                         'layer.6', 'layer.7', 'layer.8', 'layer.9', 'layer.10', 'layer.11']
            optimizer_grouped_parameters = [
                # decayed parameters: layer-wise learning rates, base_lr / 2.6**k
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group4)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group5)],
                 'weight_decay': 0.01, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay) and any(nd in n for nd in group6)],
                 'weight_decay': 0.01, 'lr': args.learning_rate},
                # no-decay parameters: same layer-wise learning rates, zero weight decay
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
                 'weight_decay': 0.0},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**5},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**4},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**3},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group4)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6**2},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group5)],
                 'weight_decay': 0.0, 'lr': args.learning_rate / 2.6},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay) and any(nd in n for nd in group6)],
                 'weight_decay': 0.0, 'lr': args.learning_rate},
            ]
        else:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0},
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                  bias_correction=False, max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    ##################################################################################################
    ############################################# TRAIN ##############################################
    ##################################################################################################
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(eval_examples, label_list,
                                                         args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
            all_pids = np.array([f.pid for f in eval_features])
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size, drop_last=True)

        model.train()
        epoch_metric = {}
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss_fct = DataParallelCriterion(loss_fct)
                    logits = [logits[i].view(-1) for i in range(len(logits))]
                    loss = loss_fct(logits, label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify lr with special warm up BERT uses;
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            with torch.no_grad():
                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []
                i = 0
                for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)
                    with torch.no_grad():
                        logits = model(input_ids, segment_ids, input_mask, labels=None)
                    if output_mode == 'classification':
                        # loss_fct = CrossEntropyLoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                        loss_fct = CrossEntropyLoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [logits[i].view(-1, num_labels) for i in range(len(logits))]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                    elif output_mode == 'regression':
                        # loss_fct = MSELoss()
                        # tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
                        loss_fct = MSELoss()
                        loss_fct = DataParallelCriterion(loss_fct)
                        logits = [logits[i].view(-1) for i in range(len(logits))]
                        tmp_eval_loss = loss_fct(logits, label_ids.view(-1))
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    logits = parallel.gather(logits, target_device='cuda:0')
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                if output_mode == 'classification':
                    preds = np.argmax(preds, axis=1)
                elif output_mode == 'regression':
                    preds = np.squeeze(preds)
                all_label_ids = all_label_ids[:preds.shape[0]]
                all_pids = all_pids[:preds.shape[0]]
                errors = generate_errors(preds, all_label_ids.numpy(), all_pids)
                result = compute_metrics(task_name, preds, all_label_ids.numpy())
                loss = tr_loss / global_step if args.do_train else None
                result['eval_loss'] = eval_loss
                result['global_step'] = global_step
                result['loss'] = loss
                logger.info('***** Eval Results *****')
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                epoch_metric[_] = result['pearson'] if output_mode == 'regression' else result['acc']

        output_eval_file = os.path.join(args.output_dir, 'eval_results.txt')
        with open(output_eval_file, 'w') as writer:
            logger.info('***** Eval Results *****')
            # for key in sorted(result.keys()):
            #     logger.info("  %s = %s", key, str(result[key]))
            #     writer.write("%s = %s\n" % (key, str(result[key])))
            # writer.write("{} {}\n".format("epoch", "pearson"))
            for key in sorted(epoch_metric.keys()):
                writer.write("{}\t{}\t{}\t{}\n".format(key, str(epoch_metric[key]),
                                                       args.learning_rate, args.train_batch_size))
        errors.to_csv('errors.txt', sep='\t', index=False)

    ##################################################################################################
    ########################################## SAVE & RELOAD #########################################
    ##################################################################################################
    if args.do_train:
        # Save a trained model, config, and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    model.to(device)
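# The explicit optimizer_grouped_parameters table above implements ULMFiT-style
# discriminative fine-tuning: each pair of BERT layers trains at a rate 2.6x
# lower than the pair above it, with decay / no-decay parameters split out.
# The same 14 groups can be generated with a loop; a sketch, assuming the
# standard 'layer.<i>.' naming of BERT-base parameters:
def discriminative_param_groups(model, base_lr, decay=0.01, lr_ratio=2.6):
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    pairs = [['layer.{}.'.format(2 * g), 'layer.{}.'.format(2 * g + 1)] for g in range(6)]
    all_layers = [name for pair in pairs for name in pair]
    groups = []
    for keep, wd in ((lambda n: not any(nd in n for nd in no_decay), decay),
                     (lambda n: any(nd in n for nd in no_decay), 0.0)):
        # everything outside the transformer stack (embeddings, pooler, classifier)
        groups.append({'params': [p for n, p in model.named_parameters()
                                  if keep(n) and not any(l in n for l in all_layers)],
                       'weight_decay': wd})
        for depth, pair in enumerate(pairs):  # depth 0 = lowest layers = smallest lr
            groups.append({'params': [p for n, p in model.named_parameters()
                                      if keep(n) and any(l in n for l in pair)],
                           'weight_decay': wd,
                           'lr': base_lr / lr_ratio ** (5 - depth)})
    return groups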
def __init__(self, model, mask_prob: float = 0.3, clip: int = 1, optimizer=None,
             beam_width: int = 5, max_len_a: float = 1.1, max_len_b: int = 5,
             len_penalty_ratio: float = 0.8, nll_loss: bool = False, fp16: bool = False,
             mm_mode="mixed", rank: int = -1):
    self.model = model
    self.clip = clip
    self.optimizer = optimizer
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.num_gpu = torch.cuda.device_count()
    self.mask_prob = mask_prob
    if nll_loss:
        self.criterion = nn.NLLLoss(ignore_index=model.text_processor.pad_token_id())
    else:
        self.criterion = SmoothedNLLLoss(ignore_index=model.text_processor.pad_token_id())
    self.fp16 = False
    self.rank = rank
    if rank >= 0:
        self.device = torch.device('cuda', rank)
        torch.cuda.set_device(self.device)
    self.model = self.model.to(self.device)
    if fp16:
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O2")
        self.fp16 = True
    self.generator = BeamDecoder(self.model, beam_width=beam_width, max_len_a=max_len_a,
                                 max_len_b=max_len_b, len_penalty_ratio=len_penalty_ratio)
    if rank >= 0:
        self.model = DistributedDataParallel(self.model, device_ids=[self.rank],
                                             output_device=self.rank,
                                             find_unused_parameters=True)
        self.generator = DistributedDataParallel(self.generator, device_ids=[self.rank],
                                                 output_device=self.rank,
                                                 find_unused_parameters=True)
    elif self.num_gpu > 1:
        print("Let's use", self.num_gpu, "GPUs!")
        self.model = DataParallelModel(self.model)
        self.criterion = DataParallelCriterion(self.criterion)
        self.generator = DataParallelModel(self.generator)
    self.reference = None
    self.best_bleu = -1.0
    self.mm_mode = mm_mode
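# Ordering matters above: amp.initialize must see the bare model and optimizer
# before any DistributedDataParallel / DataParallelModel wrapping, which is why
# the wrappers come last. Condensed restatement of that ordering (same names):
model = model.to(device)
if fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")  # before wrapping
if rank >= 0:
    model = DistributedDataParallel(model, device_ids=[rank], output_device=rank)
elif torch.cuda.device_count() > 1:
    model = DataParallelModel(model)  # per-GPU list outputs from here on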
segm_model = DataParallelModel(segm_model)
print("Let's use", torch.cuda.device_count(), "GPUs!")
segm_model.to(device)
'''if use_cuda:
    segm_model.cuda()
    seg_model = nn.DataParallel(seg_model)'''
mul_transf = [transforms.Resize(size=(img_size, img_size)), transforms.ToTensor()]
# optimizer = optim.SGD(segm_model.parameters(), lr=lr_rate, momentum=momentum)
optimizer = optim.Adam(segm_model.parameters(), lr=0.0001)
# criterion = nn.BCEWithLogitsLoss().cuda() if use_cuda else nn.BCEWithLogitsLoss()
criterion = nn.BCEWithLogitsLoss()
criterion = DataParallelCriterion(criterion)
criterion.to(device)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)
train_loader, valid_loader = CellTrainValidLoader(data_transform=transforms.Compose(mul_transf),
                                                  batch_sz=batch_size, workers=2)
dict_loaders = {"train": train_loader, "valid": valid_loader}

def train_model(cust_model,
def __init__(self, model, vocab_size, train_dataloader, test_dataloader=None,
             lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01,
             warmup_steps=10000, with_cuda: bool = True, cuda_devices=None,
             log_freq: int = 10, include_next=False, include_vision=True, total_epochs=1):
    """
    :param model: BERT model which you want to train
    :param vocab_size: total word vocab size
    :param train_dataloader: train dataset data loader
    :param test_dataloader: test dataset data loader [can be None]
    :param lr: learning rate of optimizer
    :param betas: Adam optimizer betas
    :param weight_decay: Adam optimizer weight decay param
    :param with_cuda: training with cuda
    :param log_freq: logging frequency of the batch iteration
    """
    # Setup cuda device for BERT training, argument -c, --cuda should be true
    cuda_condition = torch.cuda.is_available() and with_cuda
    self.device = torch.device("cuda:0" if cuda_condition else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", self.device, "n_gpu", n_gpu)

    # Initialize the BERT Language Model, with BERT model
    self.model = model.to(self.device)
    self.bert = self.model.bert
    self.padding_idx = 0
    self.include_next = include_next
    self.include_vision = include_vision

    # Distributed GPU training if CUDA can detect more than 1 GPU
    if with_cuda and torch.cuda.device_count() > 1:
        print("Using %d GPUS for BERT" % torch.cuda.device_count())
        # self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
        self.model = DataParallelModel(self.model, device_ids=range(torch.cuda.device_count()))

    # Setting the train and test data loader
    self.train_data = train_dataloader
    self.test_data = test_dataloader

    # Setting the Adam optimizer with hyper-param
    self.optim = optim.Adamax(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
    if self.model.__class__.__name__ in ['DataParallel', 'DataParallelModel']:
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.model.module.bert.transformer_hidden_size,
                                             n_warmup_steps=warmup_steps)
    else:
        self.optim_schedule = ScheduledOptim(self.optim,
                                             self.model.bert.transformer_hidden_size,
                                             n_warmup_steps=warmup_steps)

    # Using Negative Log Likelihood Loss function for predicting the masked_token
    self.criterion = nn.NLLLoss(ignore_index=0)
    if with_cuda and torch.cuda.device_count() > 1:
        print("Using %d GPUS for BERT" % torch.cuda.device_count())
        # self.model = nn.DataParallel(self.model, device_ids=range(torch.cuda.device_count()))
        self.criterion = DataParallelCriterion(self.criterion,
                                               device_ids=range(torch.cuda.device_count()))

    self.log_freq = log_freq
    self.total_iters = total_epochs * len(train_dataloader)
    print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
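# With the wrapping above, a training step must treat the forward output as a
# per-GPU list; a sketch of the masked-token loss computation. The forward
# signature and batch fields are assumptions, not this project's exact API:
log_probs = self.model(batch["input_ids"])                  # list: one tensor per GPU
log_probs = [lp.view(-1, lp.size(-1)) for lp in log_probs]  # (chunk*seq, vocab) each
loss = self.criterion(log_probs, batch["labels"].view(-1))  # NLLLoss scattered per GPU
self.optim.zero_grad()
loss.backward()
self.optim.step()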
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    args.warmup_steps = t_total // 100

    # Prepare optimizer and schedule (linear warmup and decay)
    optimizer_grouped_parameters = get_param_groups(args, model)
    optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        # model = torch.nn.DataParallel(model)
        model = DataParallelModel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    args.logging_steps = len(train_dataloader) // 1
    args.save_steps = args.logging_steps
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    for _ in train_iterator:
        args.current_epoch = _
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      # XLM and RoBERTa don't use segment_ids
                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None}
            # 'labels': batch[3]}
            outputs = model(**inputs)
            outputs = [outputs[i][0] for i in range(len(outputs))]
            loss_fct = CrossEntropyLoss()
            loss_fct = DataParallelCriterion(loss_fct)
            loss = loss_fct(outputs, batch[3])
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps,
                                         global_step)
                    logging_loss = tr_loss
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
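# For evaluation, the per-GPU logits lists have to be gathered back onto one
# device before computing metrics, as in the clinical-BERT eval loop earlier
# in this file. A sketch; eval_dataloader and args.device are assumptions:
import numpy as np
from torch.nn.parallel import gather

model.eval()
preds = []
with torch.no_grad():
    for batch in eval_dataloader:
        batch = tuple(t.to(args.device) for t in batch)
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        logits = [o[0] for o in outputs]                  # logits entry from each GPU's tuple
        logits = gather(logits, target_device='cuda:0')   # single tensor on device 0
        preds.append(logits.detach().cpu().numpy())
preds = np.concatenate(preds, axis=0)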
def __init__(self, metric_dict: Dict):
    super(ParallelMetricSet, self).__init__(metric_dict)
    self.metrics = {k: DataParallelCriterion(v) for k, v in metric_dict.items()}
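# Hypothetical usage: each wrapped metric consumes the per-GPU output list
# from a DataParallelModel forward pass, like the criteria elsewhere here.
metric_set = ParallelMetricSet({'nll': nn.NLLLoss()})
scores = {name: fct(outputs, target) for name, fct in metric_set.metrics.items()}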
def parallelize(self):
    self.parallel = True
    self.model = DataParallelModel(self.model)
    self.criterion = DataParallelCriterion(self.criterion)
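# Hedged usage example; the surrounding trainer object and its attributes are
# assumptions. After parallelize(), forward passes return per-GPU lists that
# the wrapped criterion consumes directly:
if torch.cuda.device_count() > 1:
    trainer.parallelize()
outputs = trainer.model(inputs)             # list if parallelized, tensor otherwise
loss = trainer.criterion(outputs, targets)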
# load the model
model = BiSalNet()
if args.onGPU and torch.cuda.device_count() > 1:
    # model = torch.nn.DataParallel(model)
    model = DataParallelModel(model)
if args.onGPU:
    model = model.cuda()
logger.info("Model Architecture:\n" + str(model))
total_parameters = sum([np.prod(p.size()) for p in model.parameters()])
logger.info('Total network parameters: ' + str(total_parameters))

criterion = CrossEntropyLoss()
if args.onGPU and torch.cuda.device_count() > 1:
    criterion = DataParallelCriterion(criterion)
if args.onGPU:
    criterion = criterion.cuda()

train_losses = AverageMeter()
train_batch_times = AverageMeter()
train_data_times = AverageMeter()
val_losses = AverageMeter()
val_times = AverageMeter()
record = {
    "loss": [],
    "lr": [],
    "val": {
        "F_beta": [],
        "MAE": []