def train_loop(index, *args):
    # Per-process entry point for TPU training (xmp.spawn-style); `index` is
    # the process ordinal on this TPU host.
    logger.debug("rank: %d entered train_loop", index)
    param_optimizer = list(k.model.named_parameters())
    # Parameters whose names contain these substrings get no weight decay
    # (standard BERT fine-tuning convention).
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'lr': 4e-5, 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'lr': 4e-5, 'weight_decay': 0.0}
    ]

    def AdamW_with_given_p(p_to_ignore, *args, **kargs):
        # fastai passes the learner's own parameters as the first argument;
        # they are ignored in favour of the grouped configuration above.
        # The base lr is scaled by the TPU world size (number of cores).
        kargs['lr'] = TrainGlobalConfig.lr*xm.xrt_world_size()
        return AdamW(optimizer_grouped_parameters, *args, **kargs)

    if index == 0:
        # rank 0 waits briefly — presumably to avoid a startup race with the
        # other processes; TODO confirm
        time.sleep(1)

    learn = k.create_learner(k, opt_func=AdamW_with_given_p,
                             loss_func=LabelSmoothing(),
                             wd=0.01,
                             callback_fns=[partial(GradientClipping, clip=0.5),
                                           ShowGraph,
                                           partial(CSVLogger, append=True),
                                           partial(CheckGrad, skip_loss_step=False)]
                             ).to_tpu_distributed()
    # LR range test first, then a short fit at a fixed lr.
    learn.lr_find(start_lr=1e-7, end_lr=1e-5, num_it=200)
    learn.recorder.plot()
    #learn.fit_one_cycle(3, max_lr=5e-6, wd=0.001)
    learn.fit(1, lr=5e-6, wd=0.001)
def setup_class(cls):
    # Class-level test fixture: load the trained model and pack the test data.
    try:
        cls.model = bert_torch.get_trained_model()
    except RuntimeError:
        # e.g. weights unavailable; tests must cope with cls.model being None
        cls.model = None
    cls.data = pack_data()
    logger.debug("Start Test bert multi lang")
def teardown_class(cls):
    """Class-level teardown: best-effort release of the model from setup_class."""
    #subprocess.run("pkill -f \"make ripdbrv\"", shell=True)
    try:
        del cls.model
    except Exception as e:
        # Consistency fix: report through the module logger instead of print().
        logger.debug("teardown_class cleanup failed: %s", e)
    logger.debug("tear down test %s", "Test_distilbert_model")
def get_toxic_comment(p, col_name="comment_text"):
    """Load column `col_name` from data file `p`, log one random row, return all rows."""
    rows = get_column(os.path.join(DATA_PATH, p), col_name=col_name)
    sample_idx = random.randint(0, len(rows) - 1)
    logger.debug("toxic data example %s.[%s]:\n %s\n", p, col_name, rows[sample_idx])

    return rows
def after_prepare_data_hook(self):
    """Bundle the prepared train/validation datasets into a fastai DataBunch."""
    logger.debug("kernel use device %s", self.device)
    cfg = self.config
    self.data = DataBunch.create(
        self.train_dataset,
        self.validation_dataset,
        bs=cfg.batch_size,
        device=self.device,
        num_workers=cfg.num_workers,
    )
def __init__(self, learn: Learner, skip_loss_step=False, batch_size=16):
    """Callback that tracks running loss/score meters during training.

    Args:
        learn: the fastai Learner this callback is attached to.
        skip_loss_step: stored flag consumed by the training logic —
            TODO confirm its exact effect.
        batch_size: batch size used when updating the loss meter.
    """
    super().__init__(learn)
    self.skip_loss_step = skip_loss_step
    # Idiom fix: lazy %-style logging args instead of eager str concatenation,
    # so the message is only formatted when DEBUG logging is enabled.
    logger.debug("Init Callback CheckGrad with skip_loss_step: %s",
                 self.skip_loss_step)
    self.losses = None        # presumably an AverageMeter, set before use — verify
    self.final_scores = None  # presumably a RocAucMeter, set before use — verify
    self.batch_size = batch_size
def on_epoch_end(self, epoch, logger=logger):
    """Every `self.interval` epochs, score the validation set with ROC-AUC and report it."""
    if epoch % self.interval != 0:
        return

    y_pred = self.model.predict(self.X_val, verbose=0)

    if y_pred.size > self.y_val.size:
        # Model emitted extra columns; score only the first one.
        y_pred = y_pred[:, 0]
    score = roc_auc_score(self.y_val, y_pred)
    # Build the message once; emit to both stdout and the debug log.
    msg = "\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score)
    print(msg)
    logger.debug(msg)
def __init__(self, learn: Learner, debug=True):
    # Callback that routes fastai training onto a TPU XLA device.
    # NOTE(review): both branches currently request the same TPU device; the
    # commented-out CPU line suggests the debug branch was meant to differ —
    # confirm intent before relying on `debug`.
    super().__init__(learn)
    self.debug = debug

    if debug:
        self.device = xm.xla_device(devkind='TPU')
        logger.debug("TPUDistributed in DEBUG mode")
        #self.device = xm.xla_device(devkind='CPU')
    else:
        self.device = xm.xla_device(devkind='TPU')
    logger.debug("%s used for xla_device for TPUDistributed" % self.device)
def on_train_begin(self, **kwargs: Any) -> None:
    # Move the model to the XLA device, scale the learning rate by the TPU
    # world size, and swap the dataloaders for distributed-sampler versions.
    self.learn.model = self.learn.model.to(self.device)
    pg = self.learn.opt.opt.param_groups
    pg0pl = pg[0]['params']  # pg0pl[0] is a Parameter
    pg1pl = pg[1]['params']  # pg0pl[0] is a Parameter
    #logger.debug("grad info: %s", raw_opt)
    logger.debug(f"on_train_begin pg0 lr: {pg[0]['lr']}")
    logger.debug(f"on_train_begin pg1 lr: {pg[1]['lr']}")

    if self.debug:
        # Scale via the fastai optimizer wrapper; mutating param_groups
        # directly as well would apply the factor twice (see comments below).
        self.learn.opt.lr = self.learn.opt.lr*xm.xrt_world_size()
        #pg[0]['lr'] *= xm.xrt_world_size()  # will do it twice...
        #pg[1]['lr'] *= xm.xrt_world_size()
        logger.debug("opt info: %s\n type: %s", self.learn.opt, type(self.learn.opt))
    else:
        self.learn.opt.lr = self.learn.opt.lr*xm.xrt_world_size()
    logger.debug("%s used for xla_device, to device done" % self.device)

    # Preserve the original shuffle setting when re-wrapping the train
    # dataloader with a distributed sampler.
    shuffle = self.data.train_dl.init_kwargs['shuffle'] if hasattr(
        self.data.train_dl, 'init_kwargs') else True
    self.old_sampler_train_dl, self.data.train_dl, self.train_sampler = _change_dl(
        self.k, self.data.train_dl, shuffle)

    if hasattr(self.data, 'valid_dl') and self.data.valid_dl is not None:
        self.old_sampler_valid_dl, self.data.valid_dl, self.valid_sampler = _change_dl_val(
            self.k, self.data.valid_dl, shuffle)
def on_epoch_begin(self, **kwargs: Any) -> None:
    # Wrap the dataloaders in per-device XLA ParallelLoaders for this epoch;
    # the originals are stashed on self — presumably restored at epoch end
    # elsewhere; TODO confirm.
    logger.debug("Epoch begins on device %s" % self.device)
    self.old_train_dl = self.data.train_dl
    self.learn.data.train_dl = pl.ParallelLoader(
        self.old_train_dl, [self.device]).per_device_loader(self.device)
    self.learn.data.train_dl.dataset = None  # self.old_train_dl.dataset

    if hasattr(self.data, 'valid_dl') and self.data.valid_dl is not None:
        self.old_valid_dl = self.learn.data.valid_dl
        self.learn.data.valid_dl = pl.ParallelLoader(
            self.old_valid_dl, [self.device]).per_device_loader(self.device)
        # Expose the underlying dataset/dataloader so downstream fastai code
        # that expects those attributes keeps working.
        self.learn.data.valid_dl.dataset = self.old_valid_dl.dataset
        self.learn.data.valid_dl.dl = self.learn.data.valid_dl._loader._loader
def train_one_epoch(model, device, config, train_loader, criterion, optimizer):
    """Train `model` for one epoch on an XLA device.

    Returns:
        (losses, final_scores): the AverageMeter / RocAucMeter accumulated
        over the epoch.
    """
    model.train()
    losses = AverageMeter()
    final_scores = RocAucMeter()
    t = time.time()

    for step, (inputs_masks, targets) in enumerate(train_loader):
        inputs = inputs_masks[0]
        attention_masks = inputs_masks[1]
        batch_size = inputs.size(0)

        if config.verbose:
            if step % config.verbose_step == 0:
                # Logs the running averages *before* this step's update.
                logger.debug(
                    f'Train Step {step}, bs: {batch_size}, loss: ' + \
                    f"{losses.avg:.5f}, lr: {optimizer.param_groups[0]['lr']} final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, " + \
                    f'time: {(time.time() - t):.5f}'
                )

        inputs = inputs.to(device, dtype=torch.long)
        attention_masks = attention_masks.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(inputs, attention_masks)
        loss = criterion(outputs, targets)
        final_scores.update(targets, outputs)
        losses.update(loss.detach().item(), batch_size)
        loss.backward()
        _check_grad(optimizer)
        #optimizer.step()
        # barrier=True forces an XLA step/sync on every optimizer step.
        xm.optimizer_step(optimizer, barrier=True)

    model.eval()
    #self.save('last-checkpoint.bin')

    return losses, final_scores
def test_fit_adv(self):
    """Fit the model for one epoch; DEBUG mode runs only a handful of steps."""
    # self.model_dev = build_distilbert_model_singleton(model_type="1st")
    if DEBUG:
        steps = 10
        epochs = 1
    else:
        steps = TRAIN_LEN//BATCH_SIZE
        logger.debug("Train len %s, batch size %s", TRAIN_LEN, BATCH_SIZE)
        epochs = 1
    logger.debug("Every epoch, steps is %s", steps)

    train_history = self.model.fit(
        train_dataset,
        steps_per_epoch=steps,
        validation_data=valid_dataset,
        callbacks=bert_cbs,
        epochs=epochs,
    )
def debug_train(use_dist_cb=True):
    """Run a short debug training session, on TPU (distributed) or GPU."""
    logger.debug(f'debug train with{" " if use_dist_cb else "OUT"} to_tpu_distributed')
    from kaggle_runner import defaults
    _DEBUG = defaults.DEBUG  # saved so the global flag can be restored on exit
    #defaults.DEBUG = True
    param_optimizer = list(k.model.named_parameters())
    # No weight decay for biases and LayerNorm parameters (BERT convention).
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # NOTE(review): per-group 'lr': 0. looks like a placeholder; the real lr
    # is injected in AdamW_with_given_p — confirm which value AdamW honours.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'lr': 0., 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'lr': 0., 'weight_decay': 0.0}
    ]

    def AdamW_with_given_p(p_to_ignore, *args, **kargs):
        # fastai passes the learner's own params first; they are ignored in
        # favour of the grouped configuration above.  8 mimics the TPU world
        # size without requiring xm.
        kargs['lr']=TrainGlobalConfig.lr*8 #xm.xrt_world_size()
        return AdamW(optimizer_grouped_parameters, *args, **kargs)

    learn = k.create_learner(k, opt_func=AdamW_with_given_p,
                             loss_func=LabelSmoothing(),
                             wd=0.01,
                             callback_fns=[partial(GradientClipping, clip=0.5),
                                           partial(CSVLogger, append=True),
                                           partial(GradientAccumulator, num_iterations=4),
                                           partial(CheckGrad, skip_loss_step=False,
                                                   batch_size=k.config.batch_size)]
                             )
    k.learner = learn

    if use_dist_cb:
        learn = learn.to_tpu_distributed()
    else:
        learn = learn.to_gpu(k)

    #learn.callback_fns.append(CheckGrad)
    #print('hello')
    #learn.lr_find(start_lr=1e-7, end_lr=1e-2, num_it=200)
    #learn.recorder.plot()
    learn.fit_one_cycle(2, max_lr=3e-5)
    #learn.fit(1, lr=4e-5) # original 0.5*e-5*8=4*e-5
    defaults.DEBUG = _DEBUG
def _check_grad(raw_opt):
    """Log std/mean of gradient norms for the first two param groups of `raw_opt`.

    Only the first 10 parameters of each group are sampled, and parameters
    without a gradient are skipped.
    """
    groups = raw_opt.param_groups
    group0_params = groups[0]['params']  # each entry is a Parameter
    group1_params = groups[1]['params']

    with torch.no_grad():
        g0_norms = torch.tensor(
            [torch.norm(p.grad) for p in group0_params[:10] if p.grad is not None])
        logger.debug("grad info pg0: norm std(%f) mean(%f)",
                     *torch.std_mean(g0_norms))

        g1_norms = torch.tensor(
            [torch.norm(p.grad) for p in group1_params[:10] if p.grad is not None])
        logger.debug("grad info pg1: norm std(%f) mean(%f)",
                     *torch.std_mean(g1_norms))
def _load_state(cls, stage=None, file_name="run_state.pkl", logger=None):
    """Restore a pickled kernel object so a run can continue.

    Args:
        stage: optional stage name; when given, the pickle file becomes
            ``run_state_{stage}.pkl``. (Default value = None)
        file_name: pickle file to restore from. (Default value = "run_state.pkl")
        logger: optional logger, also attached to the restored object.
            (Default value = None)

    Returns:
        : the kernel object, need to continue
    """
    if stage is not None:
        file_name = f"run_state_{stage}.pkl"

    if logger is not None:
        logger.debug(f"restore from {file_name}")

    restored = kernel_utils.get_obj_or_dump(filename=file_name)
    assert restored is not None
    restored.logger = logger

    return restored
def on_backward_begin(self, **kwargs: Any) -> None:
    """Log lr and running loss/score stats just before the backward pass.

    Available kwargs (from fastai's CallbackHandler):
    dict_keys(['epoch', 'iteration', 'num_batch', 'skip_validate', 'n_epochs',
    'pbar', 'metrics', 'stop_training', 'last_input', 'last_target', 'train',
    'stop_epoch', 'skip_step', 'skip_zero', 'skip_bwd', 'last_output',
    'last_loss', 'smooth_loss'])
    """
    pg = self.learn.opt.opt.param_groups
    #logger.debug("grad info: %s", raw_opt)
    logger.debug(f"on_backward_begin lr: {pg[0]['lr']}")
    logger.debug("itr: %d, num_batch: %d, last loss: %f, smooth_loss: %f",
                 kwargs['iteration'], kwargs['num_batch'],
                 kwargs['last_loss'], kwargs['smooth_loss'])
    self.final_scores.update(kwargs['last_target'], kwargs['last_output'])
    self.losses.update(kwargs['last_loss'].detach().item(), self.batch_size)
    # Fixed log format: the original ran fields together with no separators
    # ("lr_pg1: <x>final_score:<y>"); now matches the train loop's log style.
    logger.debug(f"loss_avg: {self.losses.avg:.5f}, "
                 f"lr_pg0: {pg[0]['lr']}, lr_pg1: {pg[1]['lr']}, "
                 f"final_score: {self.final_scores.avg:.5f}, "
                 f"mc_score: {self.final_scores.mc_avg:.5f}")
def _mp_fn(rank, flags, k=k):
    # xmp.spawn entry point: one invocation per TPU core; builds per-replica
    # samplers/loaders and runs the TPUFitter pipeline.
    device = xm.xla_device(devkind='TPU')
    logger.debug("%s used for xla_device" % device)
    net = k.model
    net.to(device)
    logger.debug("%s used for xla_device, to device done" % device)

    # Class-balanced (downsampled) training sampler, sharded across replicas.
    train_sampler = DistributedSamplerWrapper(
        sampler=BalanceClassSampler(labels=k.train_dataset.get_labels(),
                                    mode="downsampling"),
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True
    )
    train_loader = torch.utils.data.DataLoader(
        k.train_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        sampler=train_sampler,
        pin_memory=False,
        drop_last=True,
        num_workers=TrainGlobalConfig.num_workers,
    )
    validation_sampler = torch.utils.data.distributed.DistributedSampler(
        k.validation_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    validation_loader = torch.utils.data.DataLoader(
        k.validation_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        sampler=validation_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    validation_tune_sampler = torch.utils.data.distributed.DistributedSampler(
        k.validation_tune_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=True
    )
    validation_tune_loader = torch.utils.data.DataLoader(
        k.validation_tune_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        sampler=validation_tune_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        k.test_dataset,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        k.test_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )

    logger.debug("rank: %d. \nWill create TPU Fitter", rank)

    if rank == 0:
        # rank 0 waits briefly — presumably to avoid a startup race; TODO confirm
        time.sleep(1)

    fitter = TPUFitter(model=net, device=device, config=TrainGlobalConfig)
    fitter.fit(train_loader, validation_loader)
    fitter.run_tuning_and_inference(test_loader, validation_tune_loader)
def setup_method(self, method):
    """Per-test setup hook; records which test method is starting."""
    logger.debug("setup for method %s", method)
def test_model_fn(device=torch.device("cpu")):
    """Smoke-test the kernel's train/validate/inference path on one device.

    NOTE(review): relies on the module-level kernel `k`; several nested
    helpers below look unfinished — see inline notes.
    """
    #device = xm.xla_device(devkind='TPU')
    #device=torch.device("xla")
    logger.debug("Device used: %s", device)
    #k.run(dump_flag=True)  # it seems it cannot save right
    #k.run(dump_flag=False)
    #k.peek_data()
    self = k  # alias the global kernel so the body reads like a method
    assert self.validation_dataset is not None
    #assert self.learner is not None
    net = k.model
    assert net is not None
    net.to(device)

    param_optimizer = list(self.model.named_parameters())
    # No weight decay for biases and LayerNorm parameters (BERT convention).
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    #optimizer = AdamW(optimizer_grouped_parameters, lr=TrainGlobalConfig.lr*xm.xrt_world_size())
    # 8 stands in for the TPU world size here.
    optimizer = AdamW(optimizer_grouped_parameters, lr=TrainGlobalConfig.lr*8)

    train_loader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        shuffle=False,  # sampler is set, so shuffle here should be False
        sampler=BalanceClassSampler(labels=k.train_dataset.get_labels(),
                                    mode="downsampling"),
        pin_memory=False,
        drop_last=True,
        num_workers=TrainGlobalConfig.num_workers,
    )
    validation_loader = torch.utils.data.DataLoader(
        self.validation_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        # sampler=validation_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    test_loader = torch.utils.data.DataLoader(
        self.test_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        # sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    validation_tune_loader = torch.utils.data.DataLoader(
        self.validation_tune_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        #sampler=validation_tune_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )

    def validation(model, device, config, val_loader, criterion):
        # NOTE(review): updates the meters but returns nothing — a caller
        # expecting (losses, final_scores) would get None.
        model.eval()
        losses = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()

        for step, (inputs_masks, targets) in enumerate(val_loader):
            inputs = inputs_masks[0]
            attention_masks = inputs_masks[1]

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.info(
                        f'Valid Step {step}, loss: ' + \
                        f'{losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, ' + \
                        f'time: {(time.time() - t):.5f}'
                    )

            with torch.no_grad():
                inputs = inputs.to(device, dtype=torch.long)
                attention_masks = attention_masks.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)
                outputs = model(inputs, attention_masks)
                loss = criterion(outputs, targets)
                batch_size = inputs.size(0)
                final_scores.update(targets, outputs)
                losses.update(loss.detach().item(), batch_size)

    def run_inference(model, device, config, test_loader):
        # Predict toxicity probabilities; deliberately stops after one batch.
        model.eval()
        result = {'id': [], 'toxic': []}
        t = time.time()

        for step, (inputs_masks, ids) in enumerate(test_loader):
            inputs = inputs_masks[0]
            attention_masks = inputs_masks[1]

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.info(f'Prediction Step {step}, time: {(time.time() - t):.5f}')

            with torch.no_grad():
                inputs = inputs.to(device, dtype=torch.long)
                attention_masks = attention_masks.to(device, dtype=torch.long)
                outputs = model(inputs, attention_masks)
                toxics = nn.functional.softmax(outputs, dim=1).data.cpu().numpy()[:,1]
                result['id'].extend(ids.cpu().numpy())
                result['toxic'].extend(toxics)

            break  # just test one batch

        return result

    def train_one_epoch(model, device, config, train_loader, criterion, optimizer):
        # Local copy of the module-level train_one_epoch: one pass over the
        # loader with XLA-aware optimizer stepping.
        model.train()
        losses = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()

        for step, (inputs_masks, targets) in enumerate(train_loader):
            inputs = inputs_masks[0]
            attention_masks = inputs_masks[1]
            batch_size = inputs.size(0)

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.debug(
                        f'Train Step {step}, bs: {batch_size}, loss: ' + \
                        f"{losses.avg:.5f}, lr: {optimizer.param_groups[0]['lr']} final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, " + \
                        f'time: {(time.time() - t):.5f}'
                    )

            inputs = inputs.to(device, dtype=torch.long)
            attention_masks = attention_masks.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, targets)
            final_scores.update(targets, outputs)
            losses.update(loss.detach().item(), batch_size)
            loss.backward()
            _check_grad(optimizer)
            #optimizer.step()
            xm.optimizer_step(optimizer, barrier=True)

        model.eval()
        #self.save('last-checkpoint.bin')

        return losses, final_scores

    def run_tuning_and_inference(net, device, TrainGlobalConfig, validation_loader, train_loader):
        # NOTE(review): appears unfinished and is not called below — it uses
        # `self.optimizer`/`self.config`/`self.log`/`self.validation`/
        # `self.device` (not defined on the kernel here), references `t`
        # before assignment in the first log line, and the train_one_epoch
        # call is missing its `optimizer` argument — would raise if run.
        for e in range(1):
            self.optimizer.param_groups[0]['lr'] = self.config.lr*8
            losses, final_scores = train_one_epoch(
                net, device, TrainGlobalConfig, train_loader,
                TrainGlobalConfig.criterion, )
            self.log(f'[RESULT]: Tune_Train. Epoch: {self.epoch}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, time: {(time.time() - t):.5f}')
            t = time.time()
            para_loader = pl.ParallelLoader(validation_loader, [self.device])
            losses, final_scores = self.validation(para_loader.per_device_loader(self.device))
            self.log(f'[RESULT]: Tune_Validation. Epoch: {self.epoch}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, time: {(time.time() - t):.5f}')
        run_inference(net, device, TrainGlobalConfig, validation_loader)

    #train_one_epoch(net, device, TrainGlobalConfig, train_loader, TrainGlobalConfig.criterion, optimizer)
    #losses, final_scores = validation(net, device, TrainGlobalConfig, validation_loader, TrainGlobalConfig.criterion)
    #logger.info(f"Val results: losses={losses}, final_scores={final_scores}")
    results = run_inference(net, device, TrainGlobalConfig, validation_loader)
    # %d is filled by logging's lazy formatting; the f-prefix is redundant.
    logger.info(f"Test done, result len %d", len(results))
def teardown_method(self, method):
    """Per-test teardown hook; records which test method finished."""
    logger.debug("teardown method %s", method)
entry_str += r"""PS4='Line ${LINENO}: ' bash -x gdrive_setup >>loggdrive &""" with open("entry.sh", "w") as f: f.write(entry_str) # + import os import sys sys.path.append(os.getcwd()) import selectors import subprocess from importlib import reload, import_module import_module('kaggle_runner') from kaggle_runner import logger logger.debug("Logger loaded. Will run entry.sh.") # + p = subprocess.Popen('bash -x entry.sh', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) sel = selectors.DefaultSelector() sel.register(p.stdout, selectors.EVENT_READ) sel.register(p.stderr, selectors.EVENT_READ) while True: for key, _ in sel.select(): try: data = key.fileobj.read1(2048).decode()
def for_pytorch(data_package, device=torch.device('cuda'), SEED=118, phase="predict", model=None):
    """Predict with, or fine-tune, a BERT classifier in PyTorch.

    Args:
        data_package: (X, y, X_val, y_val, X_test) arrays.
        device: target device; if None and TPU_NAME is set, an XLA device is used.
        SEED: RNG seed for the training phase.
        phase: "predict" runs inference over X_val; anything else trains.
        model: optional pre-loaded model; loaded/built here when None.
    """
    if device is None and os.getenv("TPU_NAME") is not None:
        import torch_xla  # model
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()

    X, y, X_val, y_val, X_test = data_package

    if model is None:
        try:
            model = get_trained_model(device=device)
        except RuntimeError as e:
            # e.g. missing weights; fall through (training path can rebuild)
            logger.debug("%s", e)

    if model is not None and phase == "predict":
        # Inference only: freeze everything and predict over X_val in
        # batches of 32 (slice size below must match batch_size).
        for param in model.parameters():
            param.requires_grad = False
        model.eval()
        valid_preds = np.zeros((len(X_val)))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long))
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=32, shuffle=False)
        tk0 = tqdm(valid_loader)

        for i, (x_batch, ) in enumerate(tk0):
            # attention_mask: non-zero token ids are real tokens.
            pred = model(x_batch.to(device),
                         attention_mask=(x_batch > 0).to(device),
                         labels=None)
            valid_preds[i * 32:(i + 1) * 32] = pred[:, 0].detach().cpu().squeeze().numpy()
    else:
        # Training path: fine-tune with gradient accumulation and apex AMP.
        import subprocess
        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X, dtype=torch.long),
            torch.tensor(y, dtype=torch.float))
        output_model_file = "bert_pytorch.bin"
        lr = 1e-5
        batch_size = 32
        accumulation_steps = 3
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = False

        if model is None:
            prepare_pretrained()
            model = BertForSequenceClassification.from_pretrained(
                ".", cache_dir=None,
                num_labels=1 if len(y[0]) < 1 else len(y[0]))
        assert model is not None
        # "valication" [sic] — runtime string kept as-is.
        logger.info("AUC for valication: %f",
                    get_validation_result(model, X_val, y_val))
        model.zero_grad()
        model = model.to(device)
        param_optimizer = list(model.named_parameters())
        may_debug()
        # Only the last two encoder layers, the pooler and the classifier are
        # left trainable; the rest is frozen in para_opt_configure.
        req_grad = ['layer.10', 'layer.11', 'bert.poole', 'classifier']
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        def para_opt_configure(req_grad, no_decay):
            # Freeze/unfreeze per req_grad and build decay/no-decay groups.
            for n, p in param_optimizer:
                if any(nd in n for nd in req_grad):
                    p.requires_grad = True
                else:
                    p.requires_grad = False
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0
            }]

            return optimizer_grouped_parameters

        optimizer_grouped_parameters = para_opt_configure(req_grad, no_decay)
        train = train_dataset
        num_train_optimization_steps = int(EPOCHS * len(train) / batch_size /
                                           accumulation_steps)
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=lr,
                             warmup=0.05,
                             t_total=num_train_optimization_steps)
        # Install NVIDIA apex from the Kaggle dataset if not already present.
        subprocess.run(
            'python3 -m pip show apex || ([ -d /kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a ] && '
            'pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a)',
            shell=True,
            check=True)
        from apex import amp  # automatic mix precision
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=1)
        model = model.train()

        tq = tqdm(range(EPOCHS))

        for epoch in tq:
            train_loader = torch.utils.data.DataLoader(train,
                                                       batch_size=batch_size,
                                                       shuffle=True)
            avg_loss = 0.
            avg_accuracy = 0.
            lossf = None
            para_opt_configure(req_grad, no_decay)  # valication will change it
            tk0 = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
            optimizer.zero_grad()  # Bug fix - thanks to @chinhuic

            for i, (x_batch, y_batch) in tk0:
                # optimizer.zero_grad()
                y_pred = model(x_batch.to(device),
                               attention_mask=(x_batch > 0).to(device),
                               labels=None)
                loss = F.binary_cross_entropy_with_logits(
                    y_pred, y_batch.to(device))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()

                if (
                        i + 1
                ) % accumulation_steps == 0:  # Wait for several backward steps
                    optimizer.step()  # Now we can do an optimizer step
                    optimizer.zero_grad()

                # Exponential moving average of the loss for the progress bar.
                if lossf:
                    lossf = 0.98 * lossf + 0.02 * loss.item()
                else:
                    lossf = loss.item()
                tk0.set_postfix(loss=lossf)
                avg_loss += loss.item() / len(train_loader)
                avg_accuracy += torch.mean(
                    ((torch.sigmoid(y_pred[:, 0]) > 0.5) == (y_batch[:, 0] > 0.5).to(device)).to(
                        torch.float)).item() / len(train_loader)
            tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)

        logger.info("AUC for valication: %f",
                    get_validation_result(model, X_val, y_val))
        from datetime import date
        today = date.today()
        torch.save(model.state_dict(), f"{today}_{output_model_file}")