Example #1
def run():
    models = {
        'resnet34': mod.resnet34,
        'resnet50': mod.resnet50,
        'resnet101': mod.resnet101,
        'resnet152': mod.resnet152
    }

    db = load_data_classif(cfg.LABELS,
                           bs=8 * cfg.BATCH_SIZE,
                           train_size=cfg.TRAIN_SIZE)

    learner = cnn_learner(db,
                          models[cfg.MODEL],
                          pretrained=cfg.PRETRAINED,
                          wd=cfg.WD,
                          model_dir=cfg.MODELS_PATH,
                          metrics=[accuracy])

    save_name = f'clf_{cfg.MODEL}'
    save_name = f'{save_name}_{getNextFilePath(cfg.MODELS_PATH, save_name)}'

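    # clip gradients at 1.0 and adjust BatchNorm momentum (project helper);
    # common stabilisers when gradients are accumulated as configured below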
    learner = learner.clip_grad(1.)
    set_BN_momentum(learner.model)

    learner.fit_one_cycle(cfg.EPOCHS,
                          slice(cfg.LR),
                          callbacks=[
                              SaveModelCallback(learner,
                                                monitor='valid_loss',
                                                name=save_name),
                              AccumulateStep(learner, 64 // cfg.BATCH_SIZE),
                              LearnerTensorboardWriter(learner,
                                                       cfg.LOG,
                                                       save_name,
                                                       loss_iters=10,
                                                       hist_iters=100,
                                                       stats_iters=10)
                          ])

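    # stage 2: unfreeze the whole network and fine-tune at LR / 10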
    learner.unfreeze()
    uf_save_name = 'uf_' + save_name

    learner.fit_one_cycle(cfg.EPOCHS,
                          slice(cfg.LR / 10),
                          callbacks=[
                              SaveModelCallback(learner,
                                                monitor='valid_loss',
                                                name=uf_save_name),
                              AccumulateStep(learner, 64 // cfg.BATCH_SIZE),
                              LearnerTensorboardWriter(learner,
                                                       cfg.LOG,
                                                       uf_save_name,
                                                       loss_iters=10,
                                                       hist_iters=100,
                                                       stats_iters=10)
                          ])
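
The snippet assumes a project-wide `cfg` configuration module that is not shown. A minimal stand-in could look like the sketch below; the attribute names are exactly the ones the code reads, while every value is an illustrative placeholder, not the project's actual setting.

# Hypothetical stand-in for the cfg module used above; attribute names are
# taken from the code, values are placeholders only.
from pathlib import Path
from types import SimpleNamespace

cfg = SimpleNamespace(
    MODEL='resnet34',               # key into the models dict
    PRETRAINED=True,
    BATCH_SIZE=8,                   # AccumulateStep targets an effective 64
    TRAIN_SIZE=256,
    EPOCHS=10,
    LR=1e-3,
    WD=1e-2,
    LABELS=Path('data/labels.csv'),
    MODELS_PATH=Path('models'),
    LOG=Path('logs'),
)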
Example #2
def run():
    models = {
        'resnet34': mod.resnet34,
        'resnet50': mod.resnet50,
        'resnet101': mod.resnet101,
        'resnet152': mod.resnet152
    }

    save_name = f'mtl_{cfg.MODEL}_{cfg.TRAIN_SIZE}'
    save_name = f'{save_name}_{getNextFilePath(cfg.MODELS_PATH, save_name)}'

    test_list = MultiTaskList.from_folder(cfg.TEST_PATH, extensions=['.dcm'])
    best = 0

    pred_path = cfg.PRED_PATH / save_name
    if not pred_path.is_dir():
        pred_path.mkdir()

    project = neptune.init('schwobr/SIIM-Pneumothorax')

    for k, db in enumerate(
            load_data_kfold_mtl(cfg.LABELS,
                                bs=cfg.BATCH_SIZE,
                                train_size=cfg.TRAIN_SIZE,
                                xtra_tfms=[gaussian_noise()])):
        print(f'fold {k}')

        learner = multi_task_unet_learner(
            db,
            models[cfg.MODEL],
            log_vars=torch.tensor(cfg.LOG_VARS),
            pretrained=cfg.PRETRAINED,
            loss_func=MTLLoss(CrossEntropyFlat(), CrossEntropyFlat(axis=1)),
            wd=cfg.WD,
            model_dir=cfg.MODELS_PATH,
            opt_func=RangerW,
            metrics=[
                mtl_metric(dice, dim=1),
                mtl_metric(accuracy, dim=0),
                average_mtl_metric([dice, accuracy], [1, 0])
            ])

        fold_name = f'fold{k}_' + save_name
        set_BN_momentum(learner.model)

        learner.fit_one_cycle(cfg.EPOCHS,
                              slice(cfg.LR),
                              callbacks=[
                                  SaveModelCallback(learner,
                                                    monitor='dice_accuracy',
                                                    name=fold_name),
                                  MTLLossCallback(learner),
                                  AccumulateStep(learner,
                                                 64 // cfg.BATCH_SIZE),
                                  NeptuneCallback(learner,
                                                  project,
                                                  name=fold_name,
                                                  params={
                                                      'lr': cfg.LR,
                                                      'wd': cfg.WD,
                                                      'size': cfg.TRAIN_SIZE
                                                  }),
                                  LearnerTensorboardWriter(learner,
                                                           cfg.LOG,
                                                           fold_name,
                                                           loss_iters=10,
                                                           hist_iters=50,
                                                           stats_iters=10)
                              ])

        met = max(m[0] for m in learner.recorder.metrics)

        if met > best:
            learner.save(save_name)
            best = met
            print(f'New best fold {k} with dice {best}')

        # learner.neptune_callback.send_artifact(
        #    cfg.MODELS_PATH/(fold_name+'.pth'))
        learner.neptune_callback.stop()

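        # stage 2: unfreeze the encoder and fine-tune with discriminative LRs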
        learner.unfreeze()
        fold_name = 'uf_' + fold_name

        learner.fit_one_cycle(cfg.UNFROZE_EPOCHS,
                              slice(cfg.LR / 500, cfg.LR / 10),
                              callbacks=[
                                  SaveModelCallback(learner,
                                                    monitor='dice_accuracy',
                                                    name=fold_name),
                                  MTLLossCallback(learner),
                                  AccumulateStep(learner,
                                                 64 // cfg.BATCH_SIZE),
                                  NeptuneCallback(learner,
                                                  project,
                                                  name=fold_name,
                                                  params={
                                                      'lr': cfg.LR,
                                                      'wd': cfg.WD,
                                                      'size': cfg.TRAIN_SIZE
                                                  }),
                                  LearnerTensorboardWriter(learner,
                                                           cfg.LOG,
                                                           fold_name,
                                                           loss_iters=10,
                                                           hist_iters=50,
                                                           stats_iters=10)
                              ])

        met = max(m[0] for m in learner.recorder.metrics)

        if met > best:
            learner.save(save_name)
            best = met
            print(f'New best fold {k} with dice {best}')

        # learner.neptune_callback.send_artifact(
        #    cfg.MODELS_PATH/(fold_name+'.pth'))
        learner.neptune_callback.stop()

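        # attach the test set with placeholder labels so that per-fold
        # predictions can be saved below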
        learner.data.add_test(test_list,
                              label=[test_list.items[0], '-1'],
                              tfms=(),
                              tfm_y=True)

        save_preds_mtl(learner, pred_path / str(k))

    exp = project.create_experiment(name=save_name,
                                    description='k-fold mtl training',
                                    params={
                                        'lr': cfg.LR,
                                        'wd': cfg.WD,
                                        'size': cfg.TRAIN_SIZE
                                    })

    # exp.send_artifact(cfg.MODELS_PATH/(save_name+'.pth'))

    learner.load(save_name)
    learner.data.add_test(test_list,
                          label=[test_list.items[0], '-1'],
                          tfms=(),
                          tfm_y=True)

    thr, thr_clf = get_best_thrs_mtl(
        learner,
        plot=False,
        a=0.,
        test_size=cfg.TEST_SIZE,
        exp=None,
        fig_path=cfg.FIG_PATH / (save_name + '.png'))

    create_submission_kfold_mtl(learner,
                                cfg.SUB_PATH / (save_name + '.csv'),
                                pred_path,
                                test_size=cfg.TEST_SIZE,
                                thr=thr,
                                clf_thr=0.)

    exp.send_artifact(cfg.SUB_PATH / (save_name + '.csv'))
    exp.stop()
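
Both runs pass `AccumulateStep(learner, 64 // cfg.BATCH_SIZE)` so that gradients are accumulated up to an effective batch size of 64 before each optimizer step. The project's implementation is not shown; the sketch below is an assumption modelled on fastai v1's built-in `AccumulateScheduler`, not the actual class.

# Hedged sketch of a gradient-accumulation callback in the spirit of the
# AccumulateStep used above; modelled on fastai v1's AccumulateScheduler,
# not the project's actual code.
from fastai.basic_train import LearnerCallback

class AccumulateStepSketch(LearnerCallback):
    def __init__(self, learn, n_step):
        super().__init__(learn)
        self.n_step = n_step  # e.g. 64 // BATCH_SIZE for an effective 64

    def on_epoch_begin(self, **kwargs):
        self.count = 0

    def on_backward_end(self, **kwargs):
        # until n_step mini-batches have contributed gradients, tell the
        # training loop to skip both optimizer.step() and zero_grad()
        self.count += 1
        if self.count % self.n_step != 0:
            return {'skip_step': True, 'skip_zero': True}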
Example #3
    def fit(self,
            epochs=10,
            lr=None,
            one_cycle=True,
            early_stopping=False,
            checkpoint=True,
            tensorboard=False,
            **kwargs):
        """
        Train the model for the specified number of epochs, using the
        specified learning rate(s).

        =====================   ===========================================
        **Argument**            **Description**
        ---------------------   -------------------------------------------
        epochs                  Required integer. Number of cycles of training
                                on the data. Increase it if underfitting.
        ---------------------   -------------------------------------------
        lr                      Optional float or slice of floats. Learning rate
                                to be used for training the model. If ``lr=None``, 
                                an optimal learning rate is automatically deduced 
                                for training the model.
        ---------------------   -------------------------------------------
        one_cycle               Optional boolean. Parameter to select the 1cycle
                                learning rate schedule. If set to `False`, no
                                learning rate schedule is used.
        ---------------------   -------------------------------------------
        early_stopping          Optional boolean. Parameter to add early stopping.
                                If set to `True`, training will stop if the
                                validation loss stops improving for 5 epochs.
        ---------------------   -------------------------------------------
        checkpoint              Optional boolean. Parameter to save the best model
                                during training. If set to `True`, the best model
                                based on validation loss will be saved during
                                training.
        ---------------------   -------------------------------------------
        tensorboard             Optional boolean. Parameter to write the training log.
                                If set to `True`, the log will be saved at
                                <dataset-path>/training_log, which can be visualized
                                in TensorBoard. Requires tensorboardx version 1.7
                                (experimental support).

                                The default value is `False`.
        =====================   ===========================================
        """
        self._check_requisites()

        if lr is None:
            print('Finding optimum learning rate.')

            lr = self.lr_find(allow_plot=False)
            lr = slice(lr / 10, lr)

        self._learning_rate = lr

        if arcgis.env.verbose:
            logger.info('Fitting the model.')

        if getattr(self, '_backend', 'pytorch') == 'tensorflow':
            checkpoint = False

        callbacks = kwargs.pop('callbacks', [])
        if early_stopping:
            callbacks.append(
                EarlyStoppingCallback(learn=self.learn,
                                      monitor='valid_loss',
                                      min_delta=0.01,
                                      patience=5))
        if checkpoint:
            from datetime import datetime
            now = datetime.now()
            callbacks.append(
                SaveModelCallback(
                    self,
                    monitor='valid_loss',
                    every='improvement',
                    name=now.strftime("checkpoint_%Y-%m-%d_%H-%M-%S")))

        # If tensorboardx is installed write a log with name as timestamp
        if tensorboard and HAS_TENSORBOARDX:
            training_id = time.strftime("log_%Y-%m-%d_%H-%M-%S")
            log_path = Path(os.path.dirname(self._data.path)) / 'training_log'
            callbacks.append(
                LearnerTensorboardWriter(learn=self.learn,
                                         base_dir=log_path,
                                         name=training_id))
            hostname = socket.gethostname()
            print(
                "Monitor training in TensorBoard with the following command: 'tensorboard --host={} --logdir={}'"
                .format(hostname, log_path))
        # Send out a warning if tensorboardX is not installed
        elif tensorboard:
            warn(
                "Install tensorboardX 1.7 ('pip install tensorboardx==1.7') to write the training log"
            )

        if one_cycle:
            self.learn.fit_one_cycle(epochs, lr, callbacks=callbacks, **kwargs)
        else:
            self.learn.fit(epochs, lr, callbacks=callbacks, **kwargs)
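
A hedged usage sketch of this method: `model` stands for any instance of a class exposing the `fit` above, and the argument values are illustrative only.

# Hypothetical call; `model` is an instance of a class defining fit() above.
model.fit(epochs=20,
          lr=slice(1e-5, 1e-4),  # discriminative learning rates
          early_stopping=True,   # stop after 5 epochs without improvement
          checkpoint=True,       # keep the best model by valid_loss
          tensorboard=True)      # requires tensorboardx==1.7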
Example #4
def train(train_dataset: torch.utils.data.Dataset,
          test_dataset: torch.utils.data.Dataset,
          training_config: dict = train_config,
          global_config: dict = global_config):
    """
    Template training routine. Takes a training and a test dataset wrapped
    as torch.utils.data.Dataset type and two corresponding generic
    configs for both gobal path settings and training settings.
    Returns the fitted fastai.train.Learner object which can be
    used to assess the resulting metrics and error curves etc.
    """

    for path in global_config.values():
        create_dirs(path)

    # wrap datasets with DataLoader classes
    train_loader = torch.utils.data.DataLoader(
        train_dataset, **training_config["DATA_LOADER_CONFIG"])
    test_loader = torch.utils.data.DataLoader(
        test_dataset, **training_config["DATA_LOADER_CONFIG"])
    databunch = DataBunch(train_loader, test_loader)

    # instantiate model and learner
    if training_config["WEIGHTS"] is None:
        model = training_config["MODEL"](**training_config["MODEL_CONFIG"])
    else:
        model = load_model(training_config["MODEL"],
                           training_config["MODEL_CONFIG"],
                           training_config["WEIGHTS"],
                           training_config["DEVICE"])

    learner = Learner(databunch,
                      model,
                      metrics=training_config["METRICS"],
                      path=global_config["ROOT_PATH"],
                      model_dir=global_config["WEIGHT_DIR"],
                      loss_func=training_config["LOSS"])

    # model name & paths
    name = "_".join([training_config["DATE"],
                     training_config["SESSION_NAME"]])
    modelpath = os.path.join(global_config["WEIGHT_DIR"], name)

    if training_config["MIXED_PRECISION"]:
        learner.to_fp16()

    learner.save(modelpath)

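    # let cuDNN benchmark convolution algorithms (faster for fixed input sizes)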
    torch.backends.cudnn.benchmark = True

    cbs = [
        SaveModelCallback(learner),
        LearnerTensorboardWriter(
            learner,
            Path(os.path.join(global_config["LOG_DIR"]), "tensorboardx"),
            name),
        TerminateOnNaNCallback()
    ]

    # perform training iteration
    try:
        if train_config["ONE_CYCLE"]:
            learner.fit_one_cycle(train_config["EPOCHS"],
                                  max_lr=train_config["LR"],
                                  callbacks=cbs)
        else:
            learner.fit(train_config["EPOCHS"],
                        lr=train_config["LR"],
                        callbacks=cbs)
    # save model files on interrupt, then re-raise
    except KeyboardInterrupt:
        learner.save(modelpath)
        raise

    learner.save(modelpath)
    val_loss = min(learner.recorder.val_losses)
    val_metrics = learner.recorder.metrics

    # log using the logging tool
    logger = log.Log(training_config,
                     run_name=training_config['SESSION_NAME'])
    logger.log_metric('Validation Loss', val_loss)
    logger.log_metrics(val_metrics)
    logger.end_run()

    # write csv log file
    log_content = training_config.copy()
    log_content["VAL_LOSS"] = val_loss
    log_content["VAL_METRICS"] = val_metrics
    log_path = os.path.join(global_config["LOG_DIR"],
                            training_config["LOGFILE"])
    write_log(log_path, log_content)

    return learner, log_content, name
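
train() reads a fixed set of keys from its two config dicts. A hedged sketch of their expected shape follows; every key below is one the code above actually reads, while the values are placeholders and MyModel is a hypothetical model class.

# Illustrative shape of the config dicts train() expects; keys are taken
# from the code above, values are placeholders. MyModel is hypothetical.
import torch

train_config = {
    "DATE": "2019-08-01",
    "SESSION_NAME": "baseline",
    "MODEL": MyModel,              # hypothetical torch.nn.Module subclass
    "MODEL_CONFIG": {},            # kwargs passed to MODEL(...)
    "WEIGHTS": None,               # or a checkpoint path for load_model()
    "DEVICE": "cuda",
    "DATA_LOADER_CONFIG": {"batch_size": 16, "shuffle": True},
    "METRICS": [],
    "LOSS": torch.nn.CrossEntropyLoss(),
    "LR": 1e-3,
    "EPOCHS": 10,
    "ONE_CYCLE": True,
    "MIXED_PRECISION": False,
    "LOGFILE": "training_log.csv",
}

global_config = {
    "ROOT_PATH": ".",              # Learner base path
    "WEIGHT_DIR": "weights",       # model checkpoints
    "LOG_DIR": "logs",             # tensorboard + csv logs
}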