def init_model(config):
    """Rebuild the BERT sequence classifier and restore its saved weights.

    Reads the configuration, seeds the RNGs, instantiates the task
    processor and tokenizer, creates a ``BertForSequenceClassification``
    sized to the processor's label list, and loads the ``'state_dict'``
    entry of the checkpoint stored at ``cfg.model_save_pth``.

    Args:
        config: Passed straight to ``read_config``.

    Returns:
        Tuple ``(model, processor, cfg_optim, label_list, tokenizer, device)``.
    """
    cfg, _cfg_data, _cfg_model, cfg_optim = read_config(config)

    device, n_gpu = utils.get_device()
    utils.set_seeds(cfg.seed, n_gpu)

    # The per-step batch size is computed here but not used further below.
    train_batch_size = int(
        cfg_optim.train_batch_size / cfg_optim.gradient_accumulation_steps)

    processor = get_class(cfg.task.lower())
    # The returned examples are discarded; the call is kept as-is.
    # NOTE(review): presumably needed before get_labels() — confirm.
    processor.get_train_examples(cfg.data_dir)
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(
        cfg.bert_model, do_lower_case=cfg.do_lower_case)

    # Prepare model
    cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1)
    model = BertForSequenceClassification.from_pretrained(
        cfg.bert_model,
        cache_dir=cache_dir,
        num_labels=len(label_list))
    model.to(device)

    # Without CUDA the checkpoint tensors are remapped onto the CPU.
    if torch.cuda.is_available():
        checkpoint = torch.load(cfg.model_save_pth)
    else:
        checkpoint = torch.load(cfg.model_save_pth, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])

    return model, processor, cfg_optim, label_list, tokenizer, device
def train(self):
    """
    Full training logic

    Runs one train and one validation pass per epoch from
    ``self.start_epoch`` up to (excluding) ``self.epochs``, saves a
    checkpoint after every epoch and stops early once
    ``self.consecutive_stale`` reaches ``self.consecutive_stale_break``.
    The best loss / IoU / accuracy are then logged to mlflow and the run
    is ended.

    Returns:
        ``self.best_iou``.
    """
    t0 = time()
    # Remember the last epoch that actually ran. When the epoch range is
    # empty (start_epoch >= epochs) the loop variable is never bound, so it
    # must not be read after the loop.
    last_epoch = None
    for epoch in range(self.start_epoch, self.epochs):
        last_epoch = epoch
        set_seeds(self.config['seeds'], epoch)

        # Train and Valid losses
        train_loss = self._train_epoch(epoch)
        valid_loss, accuracy, iou, _ = self._valid_epoch(epoch)

        time_elapsed = time() - t0
        print("Epoch %1d completed after %4d secs" % (epoch, time_elapsed))

        self._save_checkpoint(epoch, train_loss, valid_loss, accuracy, iou,
                              save_last=True)

        if self.consecutive_stale >= self.consecutive_stale_break:
            break

    mlflow.log_metric('Loss', self.best_loss)
    mlflow.log_metric('IoU', self.best_iou)
    mlflow.log_metric('Accuracy', self.best_accuracy)
    if last_epoch is not None:
        mlflow.log_metric('Epoch', last_epoch)
    mlflow.end_run()

    return self.best_iou
def train(self):
    """
    Full training logic

    Iterates from ``self.start_iter`` up to (excluding) ``self.nb_iters``.
    Every iteration runs ``self._train_iter``; on iterations that are a
    multiple of ``self.save_period`` the model is also validated, progress
    is logged and a checkpoint is saved. The best metrics are logged to
    mlflow at the end.

    Returns:
        ``self.best_iou``.
    """
    t0 = time()
    for iteration in range(self.start_iter, self.nb_iters):
        set_seeds(self.config['seeds'], iteration)

        # Train losses
        semi_loss, seg_loss, disc_loss = self._train_iter(iteration)
        iter_time = time() - t0

        # Everything below only happens on checkpoint iterations.
        if iteration % self.save_period != 0:
            continue

        train_loss = (semi_loss, seg_loss, disc_loss)
        valid_loss, accuracy, iou, _ = self._valid_iter(iteration)

        percent_done = 100.0 * iteration / self.nb_iters
        # Elapsed time scaled by (remaining iterations / completed iterations).
        scaled_time = (iter_time * (self.nb_iters - iteration)
                       / (iteration - self.start_iter + 1))
        train_msg = (
            '> [{}/{} ({:.0f}%), {:.2f}s] Semi_L: {:.6f} - Seg_L: {:.6f} - D_L: {:.6f}'
        ).format(iteration, self.nb_iters, percent_done, scaled_time,
                 semi_loss, seg_loss, disc_loss)
        self.logger.info(train_msg)

        valid_msg = (
            '> [Valid Loss: {:.6f} - Accuracy: {:.2f} - IoU: {:.3f} '
        ).format(valid_loss, accuracy, iou)
        self.logger.info(valid_msg)

        # Save the checkpoints.
        time_elapsed = time() - t0
        print("Iteration %1d completed after %4d secs" % (iteration, time_elapsed))
        self._save_checkpoint(iteration, train_loss, valid_loss, accuracy,
                              iou, save_last=True)

    for metric_name, metric_value in (('Loss', self.best_loss),
                                      ('IoU', self.best_iou),
                                      ('Accuracy', self.best_accuracy)):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.end_run()

    return self.best_iou
def predict(self):
    """Run prediction over ``self.predict_loader`` and save each result.

    Each sample's image is moved to ``self.device``, passed to
    ``self.make_prediction`` and the output is handed to
    ``self.save_prediction`` together with the sample's name, initial size
    and the image. The total wall-clock time is printed at the end.
    """
    started_at = time()
    set_seeds(self.config['seeds'])

    for sample in self.predict_loader:
        image = sample['image'].to(self.device)
        name = sample['name']
        initial_size = sample['initial_size']

        self.save_prediction(self.make_prediction(image), name,
                             initial_size, image)

    time_elapsed = time() - started_at
    print("Prediction completed after %4d secs" % (time_elapsed))
def main():
    """Build data loaders, model, optimizer and trainer, then run ``cfg.mode``.

    ``cfg`` is not a parameter: it is read from the enclosing (module)
    scope. Depending on ``cfg.mode`` the trainer is run in ``'train'``,
    ``'train_eval'`` or ``'eval'`` mode; in ``'eval'`` mode the mean
    accuracy is printed.

    The loss callbacks defined here share the signature
    ``(model, sup_batch, unsup_batch, global_step)`` and return a 4-tuple of
    losses; ``get_acc`` takes ``(model, batch)`` and returns
    ``(accuracy, result)``.
    """
    # Load Configuration
    model_cfg = configuration.model.from_json(cfg.model_cfg)        # BERT_cfg
    set_seeds(cfg.seed)

    # Load Data & Create Criterion
    #data = load_data(cfg)
    #if cfg.uda_mode or cfg.mixmatch_mode:
    #    data_iter = [data.sup_data_iter(), data.unsup_data_iter()] if cfg.mode=='train' \
    #        else [data.sup_data_iter(), data.unsup_data_iter(), data.eval_data_iter()]  # train_eval
    #else:
    #    data_iter = [data.sup_data_iter()]

    # my own implementation
    dataset = DataSet(cfg)
    train_dataset, val_dataset, unsup_dataset = dataset.get_dataset()

    # Create the DataLoaders for our training and validation sets.
    train_dataloader = DataLoader(
        train_dataset,                              # The training samples.
        sampler=RandomSampler(train_dataset),       # Select batches randomly
        batch_size=cfg.train_batch_size             # Trains with this batch size.
    )

    validation_dataloader = DataLoader(
        val_dataset,                                # The validation samples.
        sampler=SequentialSampler(val_dataset),     # Pull out batches sequentially.
        batch_size=cfg.eval_batch_size              # Evaluate with this batch size.
    )

    # The unsupervised loader only exists when an unsupervised set was returned.
    unsup_dataloader = None
    if unsup_dataset:
        unsup_dataloader = DataLoader(
            unsup_dataset,
            sampler=RandomSampler(unsup_dataset),
            batch_size=cfg.train_batch_size
        )

    if cfg.uda_mode or cfg.mixmatch_mode:
        data_iter = [train_dataloader, unsup_dataloader, validation_dataloader]
    else:
        data_iter = [train_dataloader, validation_dataloader]

    ema_optimizer = None
    ema_model = None

    if cfg.model == "custom":
        model = models.Classifier(model_cfg, NUM_LABELS[cfg.task])
    elif cfg.model == "bert":
        model = BertForSequenceClassificationCustom.from_pretrained(
            "bert-base-uncased",                    # Use the 12-layer BERT model, with an uncased vocab.
            num_labels=NUM_LABELS[cfg.task],
            output_attentions=False,                # Whether the model returns attentions weights.
            output_hidden_states=False,             # Whether the model returns all hidden-states.
        )

    # The criteria below are created but not referenced by the loss
    # functions defined in this function.
    if cfg.uda_mode:
        if cfg.unsup_criterion == 'KL':
            unsup_criterion = nn.KLDivLoss(reduction='none')
        else:
            unsup_criterion = nn.MSELoss(reduction='none')
        sup_criterion = nn.CrossEntropyLoss(reduction='none')
        optimizer = optim.optim4GPU(cfg, model)
    elif cfg.mixmatch_mode:
        train_criterion = SemiLoss()
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr)
        ema_model = models.Classifier(model_cfg, NUM_LABELS[cfg.task])
        for param in ema_model.parameters():
            param.detach_()
        ema_optimizer = WeightEMA(cfg, model, ema_model, alpha=cfg.ema_decay)
    else:
        sup_criterion = nn.CrossEntropyLoss(reduction='none')
        optimizer = optim.optim4GPU(cfg, model)

    # Create trainer
    trainer = train.Trainer(cfg, model, data_iter, optimizer, get_device(), ema_model, ema_optimizer)

    # loss functions
    def get_sup_loss(model, sup_batch, unsup_batch, global_step):
        """Supervised-only loss (optionally with mixup and TSA masking).

        ``unsup_batch`` is accepted for signature compatibility and ignored.
        Returns the same scalar loss four times.
        """
        # batch
        input_ids, segment_ids, input_mask, og_label_ids, num_tokens = sup_batch

        # convert label ids to hot vectors
        # NOTE(review): the one-hot width is hard-coded to 2 classes rather
        # than NUM_LABELS[cfg.task] — confirm this is intended.
        sup_size = input_ids.size(0)
        label_ids = torch.zeros(sup_size, 2).scatter_(1, og_label_ids.cpu().view(-1, 1), 1)
        label_ids = label_ids.cuda(non_blocking=True)

        # sup mixup
        sup_l = np.random.beta(cfg.alpha, cfg.alpha)
        sup_l = max(sup_l, 1 - sup_l)
        sup_idx = torch.randperm(sup_size)

        if cfg.sup_mixup and 'word' in cfg.sup_mixup:
            if cfg.simple_pad:
                simple_pad(input_ids, input_mask, num_tokens)
                c_input_ids = None
            else:
                input_ids, c_input_ids = pad_for_word_mixup(
                    input_ids, input_mask, num_tokens, sup_idx
                )
        else:
            c_input_ids = None

        # sup loss
        hidden = model(
            input_ids=input_ids,
            segment_ids=segment_ids,
            input_mask=input_mask,
            output_h=True,
            mixup=cfg.sup_mixup,
            shuffle_idx=sup_idx,
            clone_ids=c_input_ids,
            l=sup_l,
            manifold_mixup=cfg.manifold_mixup,
            simple_pad=cfg.simple_pad,
            no_grad_clone=cfg.no_grad_clone
        )
        logits = model(input_h=hidden)

        if cfg.sup_mixup:
            label_ids = mixup_op(label_ids, sup_l, sup_idx)

        # Cross entropy against the (possibly mixed) soft labels, per sample.
        sup_loss = -torch.sum(F.log_softmax(logits, dim=1) * label_ids, dim=1)

        if cfg.tsa and cfg.tsa != "none":
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            # Samples above the threshold are masked out of the loss.
            loss_mask = torch.ones_like(og_label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        return sup_loss, sup_loss, sup_loss, sup_loss

    def get_loss_ict(model, sup_batch, unsup_batch, global_step):
        """Supervised loss plus a mixup consistency loss on unlabeled data.

        Returns ``(final_loss, sup_loss, unsup_loss, w * unsup_loss)``, or
        the supervised loss four times when ``cfg.no_unsup_loss`` is set.
        """
        # batch
        input_ids, segment_ids, input_mask, og_label_ids, num_tokens = sup_batch
        ori_input_ids, ori_segment_ids, ori_input_mask, \
        aug_input_ids, aug_segment_ids, aug_input_mask, \
        ori_num_tokens, aug_num_tokens = unsup_batch

        # convert label ids to hot vectors
        # NOTE(review): the one-hot width is hard-coded to 2 classes rather
        # than NUM_LABELS[cfg.task] — confirm this is intended.
        sup_size = input_ids.size(0)
        label_ids = torch.zeros(sup_size, 2).scatter_(1, og_label_ids.cpu().view(-1, 1), 1)
        label_ids = label_ids.cuda(non_blocking=True)

        # sup mixup
        sup_l = np.random.beta(cfg.alpha, cfg.alpha)
        sup_l = max(sup_l, 1 - sup_l)
        sup_idx = torch.randperm(sup_size)

        if cfg.sup_mixup and 'word' in cfg.sup_mixup:
            if cfg.simple_pad:
                simple_pad(input_ids, input_mask, num_tokens)
                c_input_ids = None
            else:
                input_ids, c_input_ids = pad_for_word_mixup(
                    input_ids, input_mask, num_tokens, sup_idx
                )
        else:
            c_input_ids = None

        # sup loss
        if cfg.model == "bert":
            logits = model(
                input_ids=input_ids,
                c_input_ids=c_input_ids,
                attention_mask=input_mask,
                mixup=cfg.sup_mixup,
                shuffle_idx=sup_idx,
                l=sup_l,
                manifold_mixup=cfg.manifold_mixup,
                no_pretrained_pool=cfg.no_pretrained_pool
            )
        else:
            hidden = model(
                input_ids=input_ids,
                segment_ids=segment_ids,
                input_mask=input_mask,
                output_h=True,
                mixup=cfg.sup_mixup,
                shuffle_idx=sup_idx,
                clone_ids=c_input_ids,
                l=sup_l,
                manifold_mixup=cfg.manifold_mixup,
                simple_pad=cfg.simple_pad,
                no_grad_clone=cfg.no_grad_clone
            )
            logits = model(input_h=hidden)

        if cfg.sup_mixup:
            label_ids = mixup_op(label_ids, sup_l, sup_idx)

        sup_loss = -torch.sum(F.log_softmax(logits, dim=1) * label_ids, dim=1)

        if cfg.tsa and cfg.tsa != "none":
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            loss_mask = torch.ones_like(og_label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        if cfg.no_unsup_loss:
            return sup_loss, sup_loss, sup_loss, sup_loss

        # unsup loss
        # The target distribution is computed without gradient tracking.
        with torch.no_grad():
            if cfg.model == "bert":
                ori_logits = model(
                    input_ids=ori_input_ids,
                    attention_mask=ori_input_mask,
                    no_pretrained_pool=cfg.no_pretrained_pool
                )
            else:
                ori_logits = model(ori_input_ids, ori_segment_ids, ori_input_mask)
            ori_prob = F.softmax(ori_logits, dim=-1)    # KLdiv target

        # mixup
        l = np.random.beta(cfg.alpha, cfg.alpha)
        l = max(l, 1 - l)
        # The permutation shuffles the unlabeled batch, so it is sized from
        # that batch. Sizing it from `hidden` fails on the "bert" path, where
        # `hidden` is never assigned.
        idx = torch.randperm(ori_input_ids.size(0))

        if cfg.mixup and 'word' in cfg.mixup:
            ori_input_ids, c_ori_input_ids = pad_for_word_mixup(
                ori_input_ids, ori_input_mask, ori_num_tokens, idx
            )
        else:
            c_ori_input_ids = None

        #for i in range(0, batch_size):
        #    new_mask = ori_input_mask[i]
        #    new_ids = ori_input_ids[i]
        #    old_ids = c_ori_input_ids[i]
        #    pdb.set_trace()

        if cfg.model == "bert":
            logits = model(
                input_ids=ori_input_ids,
                c_input_ids=c_ori_input_ids,
                attention_mask=ori_input_mask,
                mixup=cfg.mixup,
                shuffle_idx=idx,
                l=l,
                manifold_mixup=cfg.manifold_mixup,
                no_pretrained_pool=cfg.no_pretrained_pool
            )
        else:
            hidden = model(
                input_ids=ori_input_ids,
                segment_ids=ori_segment_ids,
                input_mask=ori_input_mask,
                output_h=True,
                mixup=cfg.mixup,
                shuffle_idx=idx,
                clone_ids=c_ori_input_ids,
                l=l,
                manifold_mixup=cfg.manifold_mixup,
                simple_pad=cfg.simple_pad,
                no_grad_clone=cfg.no_grad_clone
            )
            logits = model(input_h=hidden)

        if cfg.mixup:
            ori_prob = mixup_op(ori_prob, l, idx)

        # Mean squared difference between predicted and target probabilities.
        probs_u = torch.softmax(logits, dim=1)
        unsup_loss = torch.mean((probs_u - ori_prob)**2)

        w = cfg.uda_coeff * sigmoid_rampup(global_step, cfg.consistency_rampup_ends - cfg.consistency_rampup_starts)
        final_loss = sup_loss + w*unsup_loss
        return final_loss, sup_loss, unsup_loss, w*unsup_loss

    # evaluation
    def get_acc(model, batch):
        """Return ``(mean accuracy, per-sample correctness)`` for one batch."""
        # input_ids, segment_ids, input_mask, label_id, sentence = batch
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)

        result = (label_pred == label_id).float()
        accuracy = result.mean()
        # output_dump.logs(sentence, label_pred, label_id)    # output dump

        return accuracy, result

    # NOTE(review): `get_loss` and `get_mixmatch_loss_short` are not defined
    # inside this function; presumably they live at module level — confirm.
    if cfg.mode == 'train':
        trainer.train(get_loss, None, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'train_eval':
        if cfg.mixmatch_mode:
            trainer.train(get_mixmatch_loss_short, get_acc, cfg.model_file, cfg.pretrain_file)
        elif cfg.uda_test_mode:
            trainer.train(get_sup_loss, get_acc, cfg.model_file, cfg.pretrain_file)
        elif cfg.uda_test_mode_two:
            trainer.train(get_loss_ict, get_acc, cfg.model_file, cfg.pretrain_file)
        else:
            trainer.train(get_sup_loss, get_acc, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'eval':
        results = trainer.eval(get_acc, cfg.model_file, None)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy :', total_accuracy)
def main(cfg, model_cfg):
    """Train and/or evaluate a classifier, optionally with an unsupervised loss.

    Args:
        cfg: Passed to ``configuration.params.from_json``; the name is then
            rebound to the loaded run configuration.
        model_cfg: Passed to ``configuration.model.from_json``; the name is
            then rebound to the loaded model configuration.

    Depending on ``cfg.mode`` the trainer runs ``'train'``, ``'train_eval'``
    or ``'eval'``; in ``'eval'`` mode the mean accuracy is printed.
    """
    # Load Configuration
    cfg = configuration.params.from_json(cfg)                   # Train or Eval cfg
    model_cfg = configuration.model.from_json(model_cfg)        # BERT_cfg
    set_seeds(cfg.seed)

    # Load Data & Create Criterion
    data = load_data(cfg)
    if cfg.uda_mode:
        unsup_criterion = nn.KLDivLoss(reduction='none')
        # In 'train' mode only the supervised and unsupervised iterators are
        # used; otherwise the evaluation iterator is appended as well.
        data_iter = [data.sup_data_iter(), data.unsup_data_iter()] if cfg.mode=='train' \
            else [data.sup_data_iter(), data.unsup_data_iter(), data.eval_data_iter()]  # train_eval
    else:
        data_iter = [data.sup_data_iter()]
    sup_criterion = nn.CrossEntropyLoss(reduction='none')

    # Load Model
    model = models.Classifier(model_cfg, len(data.TaskDataset.labels))

    # Create trainer
    trainer = train.Trainer(cfg, model, data_iter, optim.optim4GPU(cfg, model), get_device())

    # Training
    def get_loss(model, sup_batch, unsup_batch, global_step):
        """Compute the training loss for one step.

        Returns ``(final_loss, sup_loss, unsup_loss)`` when ``unsup_batch``
        is truthy, otherwise ``(sup_loss, None, None)``.
        """

        # logits -> prob(softmax) -> log_prob(log_softmax)

        # batch
        input_ids, segment_ids, input_mask, label_ids = sup_batch
        if unsup_batch:
            ori_input_ids, ori_segment_ids, ori_input_mask, \
            aug_input_ids, aug_segment_ids, aug_input_mask = unsup_batch

            # The augmented unlabeled inputs are appended to the labeled
            # batch so both go through the model in a single forward pass.
            input_ids = torch.cat((input_ids, aug_input_ids), dim=0)
            segment_ids = torch.cat((segment_ids, aug_segment_ids), dim=0)
            input_mask = torch.cat((input_mask, aug_input_mask), dim=0)

        # logits
        logits = model(input_ids, segment_ids, input_mask)

        # sup loss
        # The first sup_size rows of logits belong to the labeled batch.
        sup_size = label_ids.shape[0]
        sup_loss = sup_criterion(logits[:sup_size], label_ids)  # shape : train_batch_size
        if cfg.tsa:
            tsa_thresh = get_tsa_thresh(cfg.tsa, global_step, cfg.total_steps, start=1./logits.shape[-1], end=1)
            larger_than_threshold = torch.exp(-sup_loss) > tsa_thresh   # prob = exp(log_prob), prob > tsa_threshold
            # larger_than_threshold = torch.sum(  F.softmax(pred[:sup_size]) * torch.eye(num_labels)[sup_label_ids]  , dim=-1) > tsa_threshold
            # Samples above the threshold get weight 0; the denominator is
            # clamped from below by torch_device_one().
            loss_mask = torch.ones_like(label_ids, dtype=torch.float32) * (1 - larger_than_threshold.type(torch.float32))
            sup_loss = torch.sum(sup_loss * loss_mask, dim=-1) / torch.max(torch.sum(loss_mask, dim=-1), torch_device_one())
        else:
            sup_loss = torch.mean(sup_loss)

        # unsup loss
        if unsup_batch:
            # ori
            # The target distribution and its mask are computed without
            # gradient tracking.
            with torch.no_grad():
                ori_logits = model(ori_input_ids, ori_segment_ids, ori_input_mask)
                ori_prob = F.softmax(ori_logits, dim=-1)    # KLdiv target
                # ori_log_prob = F.log_softmax(ori_logits, dim=-1)

                # confidence-based masking
                # A threshold of -1 disables masking (all-ones mask).
                if cfg.uda_confidence_thresh != -1:
                    unsup_loss_mask = torch.max(ori_prob, dim=-1)[0] > cfg.uda_confidence_thresh
                    unsup_loss_mask = unsup_loss_mask.type(torch.float32)
                else:
                    unsup_loss_mask = torch.ones(len(logits) - sup_size, dtype=torch.float32)
                # NOTE(review): this calls `_get_device()` while the trainer
                # above uses `get_device()` — confirm both helpers exist.
                unsup_loss_mask = unsup_loss_mask.to(_get_device())

            # aug
            # softmax temperature controlling
            # Non-positive temperatures fall back to 1.
            uda_softmax_temp = cfg.uda_softmax_temp if cfg.uda_softmax_temp > 0 else 1.
            aug_log_prob = F.log_softmax(logits[sup_size:] / uda_softmax_temp, dim=-1)

            # KLdiv loss
            # The bare string below is a no-op expression statement kept verbatim.
            """ nn.KLDivLoss (kl_div) input : log_prob (log_softmax) target : prob (softmax) https://pytorch.org/docs/stable/nn.html unsup_loss is divied by number of unsup_loss_mask it is different from the google UDA official The official unsup_loss is divided by total https://github.com/google-research/uda/blob/master/text/uda.py#L175 """
            unsup_loss = torch.sum(unsup_criterion(aug_log_prob, ori_prob), dim=-1)
            unsup_loss = torch.sum(unsup_loss * unsup_loss_mask, dim=-1) / torch.max(torch.sum(unsup_loss_mask, dim=-1), torch_device_one())
            final_loss = sup_loss + cfg.uda_coeff*unsup_loss

            return final_loss, sup_loss, unsup_loss
        return sup_loss, None, None

    # evaluation
    def get_acc(model, batch):
        """Return ``(mean accuracy, per-sample correctness)`` for one batch."""
        # input_ids, segment_ids, input_mask, label_id, sentence = batch
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        # Predicted class = index of the largest logit along dim 1.
        _, label_pred = logits.max(1)

        result = (label_pred == label_id).float()
        accuracy = result.mean()
        # output_dump.logs(sentence, label_pred, label_id)    # output dump

        return accuracy, result

    if cfg.mode == 'train':
        trainer.train(get_loss, None, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'train_eval':
        trainer.train(get_loss, get_acc, cfg.model_file, cfg.pretrain_file)

    if cfg.mode == 'eval':
        results = trainer.eval(get_acc, cfg.model_file, None)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy :' , total_accuracy)
else: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') # Log GPU information logger.add_text('info', f"args: {args}") # Modify batch size if accumulating gradients args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps # Reproducibility utils.set_seeds(args.seed, multi_gpu=args.n_gpu > 0) # Build dataloaders tokenizer = tokenization.FullTokenizer(args.vocab, do_lower_case=args.do_lower_case) tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x)) pipeline = [ PipelineForPretrain( max_pred=20, # what is this? mask_prob=0.15, # actually this does nothing vocab_words=list(tokenizer.vocab.keys()), # indexer=tokenizer.convert_tokens_to_ids, max_len=args.max_seq_length) ] dataloader = SentencePairDataLoader(args.text_file, batch_size=args.train_batch_size,
pr_auc = cal_pr_auc(total_scores, total_labels) acc = cal_accuracy(total_scores, total_labels) pre = cal_precision(total_scores, total_labels) rec = cal_recall(total_scores, total_labels) far = cal_false_alarm(total_scores, total_labels) spe = cal_specific(total_scores, total_labels) rmse = cal_rmse(total_scores, total_labels) gap = cal_score_gap(total_scores, total_labels) gm = cal_geometric_mean(total_scores, total_labels) mcc = cal_MCC(total_scores, total_labels) sen = cal_sensitivity(total_scores, total_labels) f = cal_f_measure(total_scores, total_labels) pauc = cal_pAUC(total_scores, total_labels) fnr = cal_false_neg(total_scores, total_labels) print('AUC\t {}\tPR_AUC\t{}\tpAUC\t{}'.format(auc, pr_auc, pauc)) print('FAR\t{}\tFNR\t{}\tGM\t{}'.format(far, fnr, gm)) print('Precision\t{}\tRecall\t{}'.format(pre, rec)) print('Acc\t{}\tMCC\t{}'.format(acc, mcc)) print('Sen\t{}\tSpe\t{}'.format(sen, spe)) print('Gap\t{}\tRMSE\t{}'.format(gap, rmse)) print('F\t{}'.format(f)) return auc if __name__ == '__main__': args = parse_args() set_seeds(args.seed) show_params(args) train_AR_Net(args) show_params(args)
def train(config):
    """Fine-tune a BERT sequence classifier, keep the best checkpoint, then test.

    Args:
        config: Passed straight to ``read_config``.

    When ``cfg.do_train`` is set, the model is trained for up to
    ``cfg_optim.num_train_epochs`` epochs; after each epoch ``evaluate`` is
    called and the weights are saved whenever its score improves. Training
    stops once the score has failed to improve 6 evaluations in a row.
    Finally a checkpoint is loaded and ``test`` is run.
    """
    cfg, cfg_data, cfg_model, cfg_optim = read_config(config)
    device, n_gpu = utils.get_device()
    utils.set_seeds(cfg.seed, n_gpu)

    # Per-step batch size: the configured batch size split across the
    # gradient-accumulation steps.
    train_batch_size = int(cfg_optim.train_batch_size / cfg_optim.gradient_accumulation_steps)

    processor = get_class(cfg.task.lower())

    tokenizer = BertTokenizer.from_pretrained(cfg.bert_model, do_lower_case=cfg.do_lower_case)

    train_examples = None
    num_train_steps = None
    if cfg.do_train:
        train_examples = processor.get_train_examples(cfg_data.data_dir)
        num_train_steps = int(
            len(train_examples) / train_batch_size / cfg_optim.gradient_accumulation_steps * cfg_optim.num_train_epochs)

    label_list = processor.get_labels()
    # Prepare model
    print(PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1))
    model = BertForSequenceClassification.from_pretrained(
        cfg.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1),
        num_labels=len(label_list))
    model.to(device)

    # Prepare optimizer
    if cfg_optim.optimize_on_cpu:
        # The optimizer works on detached CPU copies of the parameters;
        # gradients and updated values are copied back and forth below.
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    # Parameters whose name contains any of these substrings get no weight decay.
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    # NOTE(review): t_total is None when cfg.do_train is false — confirm
    # BertAdam accepts that.
    t_total = num_train_steps
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=cfg_optim.learning_rate,
                         warmup=cfg_optim.warmup_proportion,
                         t_total=t_total)

    # Assigned but not updated anywhere in this function.
    global_step = 0
    if cfg.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, cfg_optim.max_seq_length, tokenizer, show_exp=False)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        train_dataloader = convert_features_to_tensors(train_features, train_batch_size)

        model.train()
        best_score = 0
        # Number of consecutive evaluations without improvement.
        flags = 0
        for _ in trange(int(cfg_optim.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if cfg_optim.fp16 and cfg_optim.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * cfg_optim.loss_scale
                if cfg_optim.gradient_accumulation_steps > 1:
                    loss = loss / cfg_optim.gradient_accumulation_steps
                loss.backward()

                # Only step the optimizer once every gradient_accumulation_steps batches.
                if (step + 1) % cfg_optim.gradient_accumulation_steps == 0:
                    if cfg_optim.optimize_on_cpu:
                        if cfg_optim.fp16 and cfg_optim.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / cfg_optim.loss_scale
                        is_nan = utils.set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            # Skip this update, halve the loss scale and
                            # drop the accumulated gradients.
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            cfg_optim.loss_scale = cfg_optim.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        utils.copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()

            # NOTE(review): model.train() is called only once before the
            # loop; if evaluate() switches the model to eval mode, later
            # epochs would train in eval mode — confirm.
            f1 = evaluate(model, processor, cfg_optim, label_list, tokenizer, device)
            if f1 > best_score:
                best_score = f1
                print('*f1 score = {}'.format(f1))
                flags = 0
                checkpoint = {'state_dict': model.state_dict()}
                torch.save(checkpoint, cfg_optim.model_save_pth)
            else:
                print('f1 score = {}'.format(f1))
                flags += 1
                # Stop after 6 consecutive evaluations without improvement.
                if flags >= 6:
                    break

    # NOTE(review): the checkpoint is saved to cfg_optim.model_save_pth but
    # loaded from cfg.model_save_pth — verify both resolve to the same path.
    model.load_state_dict(torch.load(cfg.model_save_pth)['state_dict'])
    test(model, processor, cfg_optim, label_list, tokenizer, device)
# Hyper-parameter values for this experiment; each entry is a list of values.
experiment_ranges = {}
experiment_ranges["loss_control_penalty"] = [0.7]
experiment_ranges["dense_layer_neurons"] = [1024]
alpha = 0.9  # Only relevant for RMSProp, smoothing constant for weight update
base_learningRate = 5e-6
experiment_ranges["learningRate"] = [1e-4]
experiment_ranges["weight_decay"] = [5e-6]

# Data and log directories live next to (not inside) the current working directory.
data_path = os.path.dirname(os.getcwd()) + "/data/"
log_dir = os.path.dirname(os.getcwd()) + "/logs/"

random_seed = 1905

# Sets all seeds to the chosen number
set_seeds(random_seed)

# If we wanna keep tensorboard logs
if record_run:
    # Creates the logging directory
    os.makedirs(log_dir, exist_ok=True)
    # Creates this experiment's log directory
    experiment_log_dir = log_dir + "/" + create_exp_name(experiment_ranges)
    os.makedirs(experiment_log_dir, exist_ok=True)
    # Store the ranges inside the experiment directory. Plain string
    # concatenation dropped the path separator and wrote the file next to
    # the directory instead of into it.
    ranges_path = os.path.join(experiment_log_dir, "experiment_ranges.txt")
    with open(ranges_path, "w") as f:
        f.write(str(experiment_ranges))

#
## DATA LOADING ##
#
# We apply several transformations to both MNIST and SVHN
#