def train_loop(index, *args):
        logger.debug("rank: %d entered train_loop", index)

        param_optimizer = list(k.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'lr': 4e-5, 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'lr': 4e-5, 'weight_decay': 0.0}
        ]

        def AdamW_with_given_p(p_to_ignore, *args, **kargs):
            # ignore the parameter list fastai passes in; build AdamW from the
            # pre-grouped parameters and scale the base lr by the TPU core count
            kargs['lr'] = TrainGlobalConfig.lr * xm.xrt_world_size()

            return AdamW(optimizer_grouped_parameters, *args, **kargs)

        if index == 0:
            time.sleep(1)
        learn = k.create_learner(k, opt_func=AdamW_with_given_p,
                                 loss_func=LabelSmoothing(),
                                 wd=0.01,
                                 callback_fns=[partial(GradientClipping, clip=0.5),
                                               ShowGraph,
                                               partial(CSVLogger, append=True),
                                               partial(CheckGrad, skip_loss_step=False)]
                                 ).to_tpu_distributed()
        learn.lr_find(start_lr=1e-7, end_lr=1e-5, num_it=200)
        learn.recorder.plot()
        #learn.fit_one_cycle(3, max_lr=5e-6, wd=0.001)
        learn.fit(1, lr=5e-6, wd=0.001)
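
# Hedged launch sketch (not from the original source): train_loop(index, ...)
# has the per-core signature expected by torch_xla's multiprocessing spawner,
# so on an 8-core TPU it would typically be started roughly like this.
import torch_xla.distributed.xla_multiprocessing as xmp

if __name__ == "__main__":
    xmp.spawn(train_loop, args=(), nprocs=8, start_method="fork")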
# Example 2
 def setup_class(cls):
     try:
         cls.model = bert_torch.get_trained_model()
     except RuntimeError:
         cls.model = None
     cls.data = pack_data()
     logger.debug("Start Test bert multi lang")
# Example 3
 def teardown_class(cls):
     #subprocess.run("pkill -f \"make ripdbrv\"", shell=True)
     try:
         del cls.model
     except Exception as e:
         print(e)
     logger.debug("tear down test %s", "Test_distilbert_model")
# Example 4
def get_toxic_comment(p, col_name="comment_text"):
    dtr = get_column(os.path.join(DATA_PATH, p), col_name=col_name)

    r = random.randint(0, len(dtr) - 1)
    logger.debug("toxic data example %s.[%s]:\n %s\n", p, col_name, dtr[r])

    return dtr
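
# Hedged usage sketch; "train.csv" is a placeholder file name, not a path
# confirmed by this repo (the column default "comment_text" comes from the code above).
rows = get_toxic_comment("train.csv", col_name="comment_text")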
# Example 5
 def after_prepare_data_hook(self):
     """Put to databunch here"""
     logger.debug("kernel use device %s", self.device)
     self.data = DataBunch.create(self.train_dataset,
                                  self.validation_dataset,
                                  bs=self.config.batch_size,
                                  device=self.device,
                                  num_workers=self.config.num_workers)
# Example 6
 def __init__(self, learn: Learner, skip_loss_step=False, batch_size=16):
     super().__init__(learn)
     self.skip_loss_step = skip_loss_step
     logger.debug("Init Callback CheckGrad with skip_loss_step: " +
                  str(self.skip_loss_step))
     self.losses = None
     self.final_scores = None
     self.batch_size = batch_size
# Example 7
    def on_epoch_end(self, epoch, logger=logger):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)

            if y_pred.size > self.y_val.size:
                y_pred = y_pred[:, 0]
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(
                epoch + 1, score))
            logger.debug("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(
                epoch + 1, score))
# Example 8
    def __init__(self, learn: Learner, debug=True):
        super().__init__(learn)

        self.debug = debug

        if debug:
            self.device = xm.xla_device(devkind='TPU')
            logger.debug("TPUDistributed in DEBUG mode")
            #self.device = xm.xla_device(devkind='CPU')
        else:
            self.device = xm.xla_device(devkind='TPU')
        logger.debug("%s used for xla_device for TPUDistributed" % self.device)
# Example 9
    def on_train_begin(self, **kwargs: Any) -> None:
        self.learn.model = self.learn.model.to(self.device)

        pg = self.learn.opt.opt.param_groups
        pg0pl = pg[0]['params']  # pg0pl[0] is a Parameter
        pg1pl = pg[1]['params']  # pg1pl[0] is a Parameter

        #logger.debug("grad info: %s", raw_opt)
        logger.debug(f"on_train_begin pg0 lr: {pg[0]['lr']}")
        logger.debug(f"on_train_begin pg1 lr: {pg[1]['lr']}")

        if self.debug:
            self.learn.opt.lr = self.learn.opt.lr*xm.xrt_world_size()
            #pg[0]['lr'] *= xm.xrt_world_size() # will do it twice...
            #pg[1]['lr'] *= xm.xrt_world_size()
            logger.debug("opt info: %s\n type: %s",
                         self.learn.opt, type(self.learn.opt))
        else:
            self.learn.opt.lr = self.learn.opt.lr*xm.xrt_world_size()

        logger.debug("%s used for xla_device, to device done" % self.device)

        shuffle = self.data.train_dl.init_kwargs['shuffle'] if hasattr(
            self.data.train_dl, 'init_kwargs') else True
        self.old_sampler_train_dl, self.data.train_dl, self.train_sampler = _change_dl(
            self.k,
            self.data.train_dl, shuffle)

        if hasattr(self.data, 'valid_dl') and self.data.valid_dl is not None:
            self.old_sampler_valid_dl, self.data.valid_dl, self.valid_sampler = _change_dl_val(
                self.k, self.data.valid_dl, shuffle)
# Example 10
    def on_epoch_begin(self, **kwargs: Any) -> None:
        logger.debug("Epoch begins on device %s" % self.device)

        self.old_train_dl = self.data.train_dl
        self.learn.data.train_dl = pl.ParallelLoader(
            self.old_train_dl, [self.device]).per_device_loader(self.device)
        self.learn.data.train_dl.dataset = None  # self.old_train_dl.dataset

        if hasattr(self.data, 'valid_dl') and self.data.valid_dl is not None:
            self.old_valid_dl = self.learn.data.valid_dl
            self.learn.data.valid_dl = pl.ParallelLoader(
                self.old_valid_dl, [self.device]).per_device_loader(self.device)

            self.learn.data.valid_dl.dataset = self.old_valid_dl.dataset
            self.learn.data.valid_dl.dl = self.learn.data.valid_dl._loader._loader
    def train_one_epoch(model, device, config, train_loader, criterion, optimizer):
        model.train()

        losses = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()

        for step, (inputs_masks, targets) in enumerate(train_loader):
            inputs=inputs_masks[0]
            attention_masks=inputs_masks[1]

            batch_size = inputs.size(0)

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.debug(
                        f'Train Step {step}, bs: {batch_size}, loss: ' + \
                        f"{losses.avg:.5f}, lr: {optimizer.param_groups[0]['lr']} final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, " + \
                        f'time: {(time.time() - t):.5f}'
                    )

            inputs = inputs.to(device, dtype=torch.long)
            attention_masks = attention_masks.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()

            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, targets)


            final_scores.update(targets, outputs)

            losses.update(loss.detach().item(), batch_size)

            loss.backward()
            _check_grad(optimizer)
            #optimizer.step()
            xm.optimizer_step(optimizer, barrier=True)

        model.eval()
        #self.save('last-checkpoint.bin')

        return losses, final_scores
# Example 12
    def test_fit_adv(self):
        # self.model_dev = build_distilbert_model_singleton(model_type="1st")

        if DEBUG:
            steps = 10
            epochs = 1
        else:
            steps = TRAIN_LEN//BATCH_SIZE
            logger.debug("Train len %s, batch size %s", TRAIN_LEN, BATCH_SIZE)
            epochs = 1
        logger.debug("Every epoch, steps is %s", steps)

        train_history = self.model.fit(
            train_dataset,
            steps_per_epoch=steps,
            validation_data=valid_dataset,
            callbacks=bert_cbs,
            epochs=epochs
        )
    def debug_train(use_dist_cb=True):
        logger.debug(f'debug train with{"" if use_dist_cb else "OUT"} to_tpu_distributed')
        from kaggle_runner import defaults
        _DEBUG = defaults.DEBUG
        #defaults.DEBUG = True

        param_optimizer = list(k.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'lr': 0., 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'lr': 0., 'weight_decay': 0.0}
        ]

        def AdamW_with_given_p(p_to_ignore, *args, **kargs):
            kargs['lr']=TrainGlobalConfig.lr*8 #xm.xrt_world_size()

            return AdamW(optimizer_grouped_parameters, *args, **kargs)

        learn = k.create_learner(k, opt_func=AdamW_with_given_p,
                                 loss_func=LabelSmoothing(),
                                 wd=0.01,
                                 callback_fns=[partial(GradientClipping, clip=0.5),
                                               partial(CSVLogger, append=True),
                                               partial(GradientAccumulator, num_iterations=4),
                                               partial(CheckGrad, skip_loss_step=False, batch_size=k.config.batch_size)]
                                 )
        k.learner = learn

        if use_dist_cb:
            learn = learn.to_tpu_distributed()
        else:
            learn = learn.to_gpu(k)

        #learn.callback_fns.append(CheckGrad)
        #print('hello')
        #learn.lr_find(start_lr=1e-7, end_lr=1e-2, num_it=200)
        #learn.recorder.plot()
        learn.fit_one_cycle(2, max_lr=3e-5)
        #learn.fit(1, lr=4e-5) # original 0.5*e-5*8=4*e-5
        defaults.DEBUG = _DEBUG
# Example 14
def _check_grad(raw_opt):
    pg = raw_opt.param_groups
    pg0pl = pg[0]['params']  # pg0pl[0] is a Parameter
    pg1pl = pg[1]['params']  # pg1pl[0] is a Parameter

    with torch.no_grad():
        #norms = torch.tensor([torch.norm(p) for p in pg0pl])
        #may_debug()
        #logger.debug("%s", pg0pl[0].grad)
        #logger.debug("%s", pg0pl[0].data)
        normsg = torch.tensor(
            [torch.norm(p.grad) for p in pg0pl[:10] if p.grad is not None])
        #logger.debug("params info pg0: norm std(%f) mean(%f)", *torch.std_mean(norms))
        logger.debug("grad info pg0: norm std(%f) mean(%f)",
                     *torch.std_mean(normsg))

        #norms1 = torch.tensor([torch.norm(p) for p in pg1pl])
        norms1g = torch.tensor(
            [torch.norm(p.grad) for p in pg1pl[:10] if p.grad is not None])
        #logger.debug("params info pg1: norm std(%f) mean(%f)", *torch.std_mean(norms1))
        logger.debug("grad info pg1: norm std(%f) mean(%f)",
                     *torch.std_mean(norms1g))
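
# Hedged self-contained check of _check_grad (assumption: torch.optim.AdamW
# stands in for the transformers AdamW used elsewhere in this file; the model
# here is a throwaway two-layer net, not the kernel's BERT model).
import torch
from torch import nn

_m = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
_decay = [p for n, p in _m.named_parameters() if 'bias' not in n]
_no_decay = [p for n, p in _m.named_parameters() if 'bias' in n]
_opt = torch.optim.AdamW([
    {'params': _decay, 'weight_decay': 0.01},
    {'params': _no_decay, 'weight_decay': 0.0},
], lr=1e-3)
_m(torch.randn(3, 4)).sum().backward()
_check_grad(_opt)  # logs grad-norm std/mean for both param groups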
# Example 15
    def _load_state(cls, stage=None, file_name="run_state.pkl", logger=None):
        """

        Args:
          file_name: return: the kernel object, need to continue (Default value = "run_state.pkl")
          stage: (Default value = None)
          logger: (Default value = None)

        Returns:
          : the kernel object, need to continue

        """

        if stage is not None:
            file_name = f"run_state_{stage}.pkl"

        if logger is not None:
            logger.debug(f"restore from {file_name}")
        self = kernel_utils.get_obj_or_dump(filename=file_name)
        assert self is not None
        self.logger = logger

        return self
# Example 16
    def on_backward_begin(self, **kwargs: Any) -> None:
        #print(kwargs.keys())
        """dict_keys(['epoch', 'iteration', 'num_batch', 'skip_validate',
        'n_epochs', 'pbar', 'metrics', 'stop_training', 'last_input',
        'last_target', 'train', 'stop_epoch', 'skip_step', 'skip_zero',
        'skip_bwd', 'last_output', 'last_loss', 'smooth_loss'])
        """
        pg = self.learn.opt.opt.param_groups
        #logger.debug("grad info: %s", raw_opt)
        logger.debug(f"on_backward_begin lr: {pg[0]['lr']}")
        logger.debug("itr: %d, num_batch: %d, last loss: %f, smooth_loss: %f",
                     kwargs['iteration'], kwargs['num_batch'],
                     kwargs['last_loss'], kwargs['smooth_loss'])

        self.final_scores.update(kwargs['last_target'], kwargs['last_output'])
        self.losses.update(kwargs['last_loss'].detach().item(),
                           self.batch_size)
        logger.debug(f"loss_avg: {self.losses.avg:.5f}, lr_pg0:"
                     f"{pg[0]['lr']}, lr_pg1: {pg[1]['lr']}final_score:"
                     f"{self.final_scores.avg:.5f}, mc_score:"
                     f"{self.final_scores.mc_avg:.5f}")
    def _mp_fn(rank, flags, k=k):
        device = xm.xla_device(devkind='TPU')
        logger.debug("%s used for xla_device" % device)
        net = k.model
        net.to(device)
        logger.debug("%s used for xla_device, to device done" % device)

        train_sampler = DistributedSamplerWrapper(
            sampler=BalanceClassSampler(labels=k.train_dataset.get_labels(), mode="downsampling"),
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True
        )
        train_loader = torch.utils.data.DataLoader(
            k.train_dataset,
            batch_size=TrainGlobalConfig.batch_size,
            sampler=train_sampler,
            pin_memory=False,
            drop_last=True,
            num_workers=TrainGlobalConfig.num_workers,
        )
        validation_sampler = torch.utils.data.distributed.DistributedSampler(
            k.validation_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False
        )
        validation_loader = torch.utils.data.DataLoader(
            k.validation_dataset,
            batch_size=TrainGlobalConfig.batch_size,
            sampler=validation_sampler,
            pin_memory=False,
            drop_last=False,
            num_workers=TrainGlobalConfig.num_workers
        )
        validation_tune_sampler = torch.utils.data.distributed.DistributedSampler(
            k.validation_tune_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True
        )
        validation_tune_loader = torch.utils.data.DataLoader(
            k.validation_tune_dataset,
            batch_size=TrainGlobalConfig.batch_size,
            sampler=validation_tune_sampler,
            pin_memory=False,
            drop_last=False,
            num_workers=TrainGlobalConfig.num_workers
        )
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            k.test_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False
        )
        test_loader = torch.utils.data.DataLoader(
            k.test_dataset,
            batch_size=TrainGlobalConfig.batch_size,
            sampler=test_sampler,
            pin_memory=False,
            drop_last=False,
            num_workers=TrainGlobalConfig.num_workers
        )

        logger.debug("rank: %d. Will create TPU Fitter", rank)

        if rank == 0:
            time.sleep(1)

        fitter = TPUFitter(model=net, device=device, config=TrainGlobalConfig)
        fitter.fit(train_loader, validation_loader)
        fitter.run_tuning_and_inference(test_loader, validation_tune_loader)
# Example 18
 def setup_method(self, method):
     logger.debug("setup for method %s", method)
def test_model_fn(device=torch.device("cpu")):
    #device = xm.xla_device(devkind='TPU')
    #device=torch.device("xla")
    logger.debug("Device used: %s", device)

    #k.run(dump_flag=True) # it seems it cannot save right
    #k.run(dump_flag=False)
    #k.peek_data()

    self = k
    assert self.validation_dataset is not None
    #assert self.learner is not None

    net = k.model
    assert net is not None
    net.to(device)

    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    #optimizer = AdamW(optimizer_grouped_parameters, lr=TrainGlobalConfig.lr*xm.xrt_world_size())
    optimizer = AdamW(optimizer_grouped_parameters, lr=TrainGlobalConfig.lr*8)

    train_loader = torch.utils.data.DataLoader(
        self.train_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        shuffle=False, # sampler is set, so shuffle here should be False
        sampler=BalanceClassSampler(labels=k.train_dataset.get_labels(), mode="downsampling"),
        pin_memory=False,
        drop_last=True,
        num_workers=TrainGlobalConfig.num_workers,
    )
    validation_loader = torch.utils.data.DataLoader(
        self.validation_dataset,
        batch_size=TrainGlobalConfig.batch_size,
    #    sampler=validation_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    test_loader = torch.utils.data.DataLoader(
        self.test_dataset,
        batch_size=TrainGlobalConfig.batch_size,
    #    sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )
    validation_tune_loader = torch.utils.data.DataLoader(
        self.validation_tune_dataset,
        batch_size=TrainGlobalConfig.batch_size,
        #sampler=validation_tune_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=TrainGlobalConfig.num_workers
    )

    def validation(model, device, config, val_loader, criterion):
        model.eval()
        losses = AverageMeter()
        final_scores = RocAucMeter()

        t = time.time()

        for step, (inputs_masks, targets) in enumerate(val_loader):
            inputs=inputs_masks[0]
            attention_masks=inputs_masks[1]

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.info(
                        f'Valid Step {step}, loss: ' + \
                        f'{losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, ' + \
                        f'time: {(time.time() - t):.5f}'
                    )
            with torch.no_grad():
                inputs = inputs.to(device, dtype=torch.long)
                attention_masks = attention_masks.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)

                outputs = model(inputs, attention_masks)
                loss = criterion(outputs, targets)

                batch_size = inputs.size(0)

                final_scores.update(targets, outputs)
                losses.update(loss.detach().item(), batch_size)

        return losses, final_scores

    def run_inference(model, device, config, test_loader):
        model.eval()
        result = {'id': [], 'toxic': []}
        t = time.time()

        for step, (inputs_masks, ids) in enumerate(test_loader):
            inputs=inputs_masks[0]
            attention_masks=inputs_masks[1]

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.info(f'Prediction Step {step}, time: {(time.time() - t):.5f}')

            with torch.no_grad():
                inputs = inputs.to(device, dtype=torch.long)
                attention_masks = attention_masks.to(device, dtype=torch.long)
                outputs = model(inputs, attention_masks)
                toxics = nn.functional.softmax(outputs, dim=1).data.cpu().numpy()[:,1]

            result['id'].extend(ids.cpu().numpy())
            result['toxic'].extend(toxics)

            break # just test one batch

        return result

    def train_one_epoch(model, device, config, train_loader, criterion, optimizer):
        model.train()

        losses = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()

        for step, (inputs_masks, targets) in enumerate(train_loader):
            inputs=inputs_masks[0]
            attention_masks=inputs_masks[1]

            batch_size = inputs.size(0)

            if config.verbose:
                if step % config.verbose_step == 0:
                    logger.debug(
                        f'Train Step {step}, bs: {batch_size}, loss: ' + \
                        f"{losses.avg:.5f}, lr: {optimizer.param_groups[0]['lr']} final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, " + \
                        f'time: {(time.time() - t):.5f}'
                    )

            inputs = inputs.to(device, dtype=torch.long)
            attention_masks = attention_masks.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()

            outputs = model(inputs, attention_masks)
            loss = criterion(outputs, targets)


            final_scores.update(targets, outputs)

            losses.update(loss.detach().item(), batch_size)

            loss.backward()
            _check_grad(optimizer)
            #optimizer.step()
            xm.optimizer_step(optimizer, barrier=True)

        model.eval()
        #self.save('last-checkpoint.bin')

        return losses, final_scores

    def run_tuning_and_inference(net, device, TrainGlobalConfig, validation_loader, train_loader):
        for e in range(1):
            # bump the lr the same way the distributed path does (8 TPU cores)
            optimizer.param_groups[0]['lr'] = TrainGlobalConfig.lr*8

            t = time.time()
            losses, final_scores = train_one_epoch(net, device, TrainGlobalConfig, train_loader, TrainGlobalConfig.criterion, optimizer)
            logger.info(f'[RESULT]: Tune_Train. Epoch: {e}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, time: {(time.time() - t):.5f}')

            t = time.time()
            losses, final_scores = validation(net, device, TrainGlobalConfig, validation_loader, TrainGlobalConfig.criterion)
            logger.info(f'[RESULT]: Tune_Validation. Epoch: {e}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, mc_score: {final_scores.mc_avg:.5f}, time: {(time.time() - t):.5f}')

            run_inference(net, device, TrainGlobalConfig, validation_loader)

    #train_one_epoch(net, device, TrainGlobalConfig, train_loader, TrainGlobalConfig.criterion, optimizer)
    #losses, final_scores = validation(net, device, TrainGlobalConfig, validation_loader, TrainGlobalConfig.criterion)
    #logger.info(f"Val results: losses={losses}, final_scores={final_scores}")

    results = run_inference(net, device, TrainGlobalConfig, validation_loader)
    logger.info(f"Test done, result len %d", len(results))
# Example 20
 def teardown_method(self, method):
     logger.debug("teardown method %s", method)
# Example 21
    entry_str += r"""PS4='Line ${LINENO}: ' bash -x gdrive_setup >>loggdrive &"""

with open("entry.sh", "w") as f:
    f.write(entry_str)

# +
import os
import sys
sys.path.append(os.getcwd())

import selectors
import subprocess
from importlib import reload, import_module
import_module('kaggle_runner')
from kaggle_runner import logger
logger.debug("Logger loaded. Will run entry.sh.")

# +
p = subprocess.Popen('bash -x entry.sh',
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
                     shell=True)

sel = selectors.DefaultSelector()
sel.register(p.stdout, selectors.EVENT_READ)
sel.register(p.stderr, selectors.EVENT_READ)

while True:
    for key, _ in sel.select():
        try:
            data = key.fileobj.read1(2048).decode()
            # hedged completion (the original snippet is truncated here): echo the output
            print(data, end="")
        except Exception as e:
            logger.debug("reading entry.sh output failed: %s", e)
# Example 22
def for_pytorch(data_package,
                device=torch.device('cuda'),
                SEED=118,
                phase="predict",
                model=None):

    if device is None and os.getenv("TPU_NAME") is not None:
        import torch_xla  # model
        import torch_xla.core.xla_model as xm
        device = xm.xla_device()

    X, y, X_val, y_val, X_test = data_package

    if model is None:
        try:
            model = get_trained_model(device=device)
        except RuntimeError as e:
            logger.debug("%s", e)

    if model is not None and phase == "predict":
        for param in model.parameters():
            param.requires_grad = False
        model.eval()
        valid_preds = np.zeros((len(X_val)))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long))
        valid_loader = torch.utils.data.DataLoader(valid,
                                                   batch_size=32,
                                                   shuffle=False)

        tk0 = tqdm(valid_loader)

        for i, (x_batch, ) in enumerate(tk0):
            pred = model(x_batch.to(device),
                         attention_mask=(x_batch > 0).to(device),
                         labels=None)
            valid_preds[i * 32:(i + 1) *
                        32] = pred[:, 0].detach().cpu().squeeze().numpy()
    else:
        import subprocess
        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X, dtype=torch.long),
            torch.tensor(y, dtype=torch.float))
        output_model_file = "bert_pytorch.bin"

        lr = 1e-5
        batch_size = 32
        accumulation_steps = 3
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.backends.cudnn.deterministic = False

        if model is None:
            prepare_pretrained()
            model = BertForSequenceClassification.from_pretrained(
                ".",
                cache_dir=None,
                num_labels=1 if len(y[0]) < 1 else len(y[0]))
            assert model is not None
        logger.info("AUC for valication: %f",
                    get_validation_result(model, X_val, y_val))
        model.zero_grad()
        model = model.to(device)

        param_optimizer = list(model.named_parameters())
        may_debug()

        req_grad = ['layer.10', 'layer.11', 'bert.pooler', 'classifier']
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        def para_opt_configure(req_grad, no_decay):
            for n, p in param_optimizer:
                if any(nd in n for nd in req_grad):
                    p.requires_grad = True
                else:
                    p.requires_grad = False

            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            }, {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]

            return optimizer_grouped_parameters

        optimizer_grouped_parameters = para_opt_configure(req_grad, no_decay)
        train = train_dataset

        num_train_optimization_steps = int(EPOCHS * len(train) / batch_size /
                                           accumulation_steps)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=lr,
                             warmup=0.05,
                             t_total=num_train_optimization_steps)

        subprocess.run(
            'python3 -m pip show apex || ([ -d /kaggle/input/nvidiaapex/repository/NVIDIA-apex-39e153a ] && '
            'pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a)',
            shell=True,
            check=True)
        from apex import amp  # automatic mix precision
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=1)
        model = model.train()

        tq = tqdm(range(EPOCHS))

        for epoch in tq:
            train_loader = torch.utils.data.DataLoader(train,
                                                       batch_size=batch_size,
                                                       shuffle=True)
            avg_loss = 0.
            avg_accuracy = 0.
            lossf = None
            para_opt_configure(req_grad, no_decay)  # validation will change it
            tk0 = tqdm(enumerate(train_loader),
                       total=len(train_loader),
                       leave=True)
            optimizer.zero_grad()  # Bug fix - thanks to @chinhuic

            for i, (x_batch, y_batch) in tk0:
                #        optimizer.zero_grad()
                y_pred = model(x_batch.to(device),
                               attention_mask=(x_batch > 0).to(device),
                               labels=None)
                loss = F.binary_cross_entropy_with_logits(
                    y_pred, y_batch.to(device))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()

                if (
                        i + 1
                ) % accumulation_steps == 0:  # Wait for several backward steps
                    optimizer.step()  # Now we can do an optimizer step
                    optimizer.zero_grad()

                if lossf:
                    lossf = 0.98 * lossf + 0.02 * loss.item()
                else:
                    lossf = loss.item()
                tk0.set_postfix(loss=lossf)
                avg_loss += loss.item() / len(train_loader)
                avg_accuracy += torch.mean(
                    ((torch.sigmoid(y_pred[:, 0]) > 0.5)
                     == (y_batch[:, 0] > 0.5).to(device)).to(
                         torch.float)).item() / len(train_loader)
            tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
            logger.info("AUC for valication: %f",
                        get_validation_result(model, X_val, y_val))

        from datetime import date
        today = date.today()
        torch.save(model.state_dict(), f"{today}_{output_model_file}")
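
# Hedged usage sketch (assumptions: pack_data() returns the
# (X, y, X_val, y_val, X_test) tuple that for_pytorch unpacks, and a CUDA
# device is available; neither is confirmed beyond the code above).
if __name__ == "__main__":
    data_package = pack_data()
    for_pytorch(data_package, device=torch.device("cuda"), phase="predict")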