예제 #1
0
    def test_base_ops(self):
        """A Trainer configured with an empty stages list must raise TrainerException on train()."""
        file_struct = FileStructManager(base_dir=self.base_dir, is_continue=False)
        net = SimpleModel()

        config = TrainConfig(net, [], torch.nn.L1Loss(),
                             torch.optim.SGD(net.parameters(), lr=1))
        trainer = Trainer(config, file_struct)

        with self.assertRaises(Trainer.TrainerException):
            trainer.train()
예제 #2
0
    def test_lr_decaying(self):
        """LR decaying must multiply the learning rate by the coefficient on every decay
        step: 10 epochs with patience 3 and a never-improving target value yield
        lr * coeff ** 3.
        """
        fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
        model = SimpleModel()
        metrics_processor = MetricsProcessor()
        # range(20) is iterated directly; wrapping it in list() was a redundant copy
        stages = [
            TrainStage(
                TestDataProducer([[{
                    'data': torch.rand(1, 3),
                    'target': torch.rand(1)
                } for _ in range(20)]]), metrics_processor),
            ValidationStage(
                TestDataProducer([[{
                    'data': torch.rand(1, 3),
                    'target': torch.rand(1)
                } for _ in range(20)]]), metrics_processor)
        ]
        trainer = Trainer(
            model,
            TrainConfig(stages, SimpleLoss(),
                        torch.optim.SGD(model.parameters(), lr=0.1)),
            fsm).set_epoch_num(10)

        def target_value_clbk() -> float:
            # Constant target value never improves, so decaying fires every `patience` epochs
            return 1

        trainer.enable_lr_decaying(0.5, 3, target_value_clbk)
        trainer.train()

        # 3 decay steps of 0.5 each over 10 epochs
        self.assertAlmostEqual(trainer.data_processor().get_lr(),
                               0.1 * (0.5**3),
                               delta=1e-6)
예제 #3
0
 def test_train(self):
     """Smoke test: one full epoch of train + validation stages runs without raising."""
     fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
     model = SimpleModel()
     metrics_processor = MetricsProcessor()
     # range(20) is iterated directly; the list() wrapper was a redundant copy
     stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                             for _ in range(20)]]), metrics_processor),
               ValidationStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                  for _ in range(20)]]), metrics_processor)]
     Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
         .set_epoch_num(1).train()
예제 #4
0
    def test_train_stage(self):
        """Hard negative mining must not change metrics bookkeeping: the metrics
        processor is expected to be invoked once per dataset item.
        """
        # range(20) is iterated directly; the list() wrapper was a redundant copy
        data_producer = DataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                       for _ in range(20)]])
        metrics_processor = FakeMetricsProcessor()
        train_stage = TrainStage(data_producer, metrics_processor).enable_hard_negative_mining(0.1)

        fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
        model = SimpleModel()
        Trainer(TrainConfig(model, [train_stage], SimpleLoss(), torch.optim.SGD(model.parameters(), lr=1)), fsm) \
            .set_epoch_num(1).train()

        self.assertEqual(metrics_processor.call_num, len(data_producer))
예제 #5
0
    def test_savig_best_states(self):
        """Best-state saving: the best checkpoint must appear only on epochs where the
        monitored loss improved; otherwise only the regular last checkpoint exists.
        """
        fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
        model = SimpleModel()
        metrics_processor = MetricsProcessor()
        # range(20) is iterated directly; the list() wrapper was a redundant copy
        stages = [TrainStage(TestDataProducer([[{'data': torch.rand(1, 3), 'target': torch.rand(1)}
                                                for _ in range(20)]]), metrics_processor)]
        trainer = Trainer(TrainConfig(model, stages, SimpleLoss(), torch.optim.SGD(model.parameters(), lr=0.1)),
                          fsm).set_epoch_num(3).enable_best_states_saving(lambda: np.mean(stages[0].get_losses()))

        checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last', 'last_checkpoint.zip')
        best_checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'best', 'best_checkpoint.zip')

        class Val:
            # Mutable holder so the callback can remember the previous epoch's loss
            def __init__(self):
                self.v = None

        first_val = Val()

        def on_epoch_end(val):
            # Loss improved w.r.t. the remembered value: the best checkpoint must exist
            if val.v is not None and np.mean(stages[0].get_losses()) < val.v:
                self.assertTrue(os.path.exists(best_checkpoint_file))
                os.remove(best_checkpoint_file)
                val.v = np.mean(stages[0].get_losses())
                return

            val.v = np.mean(stages[0].get_losses())

            # First epoch or no improvement: only the regular checkpoint was written
            self.assertTrue(os.path.exists(checkpoint_file))
            self.assertFalse(os.path.exists(best_checkpoint_file))
            os.remove(checkpoint_file)

        trainer.add_on_epoch_end_callback(lambda: on_epoch_end(first_val))
        trainer.train()
예제 #6
0
    def test_savig_states(self):
        """Checkpoint saving: after every epoch the last-checkpoint file must exist."""
        fsm = FileStructManager(base_dir=self.base_dir, is_continue=False)
        model = SimpleModel()
        metrics_processor = MetricsProcessor()
        # range(20) is iterated directly; wrapping it in list() was a redundant copy
        stages = [
            TrainStage(
                TestDataProducer([[{
                    'data': torch.rand(1, 3),
                    'target': torch.rand(1)
                } for _ in range(20)]]), metrics_processor)
        ]
        trainer = Trainer(
            model,
            TrainConfig(stages, SimpleLoss(),
                        torch.optim.SGD(model.parameters(), lr=0.1)),
            fsm).set_epoch_num(3)

        checkpoint_file = os.path.join(self.base_dir, 'checkpoints', 'last',
                                       'last_checkpoint.zip')

        def on_epoch_end():
            # Checkpoint must exist after each epoch; remove it so the next epoch is checked too
            self.assertTrue(os.path.exists(checkpoint_file))
            os.remove(checkpoint_file)

        trainer.add_on_epoch_end_callback(on_epoch_end)
        trainer.train()
예제 #7
0
def train(num_epochs=5):
    """Train the HumanBBox model on the one-human COCO subsets for `num_epochs` epochs."""
    file_struct_manager = FileStructManager(base_dir="models/UoI/", is_continue=False)
    net = HumanBBox()

    # Training data: batches of 8, 5 loader workers
    train_data = DataProducer(
        [BBoxDataset("coco/train2017_one_human.csv", size=SZ, type="train", fastai_out=False)],
        batch_size=8,
        num_workers=5,
    )
    # Validation data: smaller batches, fewer workers
    val_data = DataProducer(
        [BBoxDataset("coco/val2017_one_human_train.csv", size=SZ, type="val", fastai_out=False)],
        batch_size=4,
        num_workers=2,
    )

    config = TrainConfig(
        [TrainStage(train_data), ValidationStage(val_data)],
        IoU,  # loss callable (torch.nn.L1Loss() was an earlier alternative)
        torch.optim.SGD(net.parameters(), lr=5e-3, momentum=0.8),
    )

    # optional: .enable_lr_decaying(0.97, 1000)
    trainer = Trainer(net, config, file_struct_manager, torch.device("cuda:0")).set_epoch_num(num_epochs)
    trainer.monitor_hub.add_monitor(TensorboardMonitor(file_struct_manager, is_continue=True))
    # optional: .add_monitor(LogMonitor(fsm)) / .resume(from_best_checkpoint=False)
    trainer.train()
예제 #8
0
def train(config_type: BaseTrainConfig):
    """Instantiate `config_type`, wire up monitoring/decaying/stop rules and run training."""
    fsm = FileStructManager(base_dir=config_type.experiment_dir,
                            is_continue=False)

    config = config_type({'train': ['train.npy'], 'val': 'val.npy'})

    trainer = Trainer(config, fsm, device=torch.device('cuda'))
    tensorboard = TensorboardMonitor(fsm, is_continue=False)
    trainer.monitor_hub.add_monitor(tensorboard)

    def mean_val_loss() -> float:
        # Mean validation loss drives both LR decaying and best-state saving
        return np.mean(config.val_stage.get_losses())

    trainer.set_epoch_num(300)
    trainer.enable_lr_decaying(coeff=0.5, patience=10, target_val_clbk=mean_val_loss)
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.enable_best_states_saving(mean_val_loss)
    # Stop once the decayed learning rate falls below 1e-6
    trainer.add_stop_rule(lambda: trainer.data_processor().get_lr() < 1e-6)

    trainer.train()
예제 #9
0
    def __len__(self):
        # Length is delegated to the wrapped dataset
        return len(self.dataset)

    def __getitem__(self, item):
        """Return one sample as a dict: transformed 'data' and untouched 'target'."""
        sample, label = self.dataset[item]
        return {'data': self.transforms(sample), 'target': label}


if __name__ == '__main__':
    fsm = FileStructManager(base_dir='data', is_continue=False)
    net = Net()

    # MNIST train/validation loaders, 4 samples per batch, 2 workers each
    train_data = DataProducer([MNISTDataset('data/dataset', True)],
                              batch_size=4,
                              num_workers=2)
    val_data = DataProducer([MNISTDataset('data/dataset', False)],
                            batch_size=4,
                            num_workers=2)

    config = TrainConfig(net,
                         [TrainStage(train_data), ValidationStage(val_data)],
                         torch.nn.NLLLoss(),
                         torch.optim.SGD(net.parameters(), lr=1e-4, momentum=0.5))

    trainer = Trainer(config, fsm, torch.device('cuda:0')).set_epoch_num(5)
    trainer.monitor_hub.add_monitor(TensorboardMonitor(fsm, is_continue=False))
    trainer.train()
예제 #10
0
def train():
    """Run PoseNet training with tensorboard/log monitors, LR decaying and best-state saving."""
    train_config = PoseNetTrainConfig()
    fsm = FileStructManager(base_dir=PoseNetTrainConfig.experiment_dir, is_continue=False)

    trainer = Trainer(train_config, fsm, torch.device('cuda'))
    trainer.set_epoch_num(EPOCH_NUM)

    tensorboard = TensorboardMonitor(fsm, is_continue=False)
    log = LogMonitor(fsm).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    def mean_val_loss() -> float:
        # Mean validation loss drives best-state saving and LR decaying
        return np.mean(train_config.val_stage.get_losses())

    trainer.enable_best_states_saving(mean_val_loss)
    trainer.enable_lr_decaying(coeff=0.5, patience=10, target_val_clbk=mean_val_loss)
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.train()
예제 #11
0
def train():
    """Fine-tune a pretrained resnet18 (1 output class) for 2 epochs with full monitoring."""
    net = resnet18(classes_num=1, in_channels=3, pretrained=True)
    config = TrainConfig(net, [train_stage, val_stage], torch.nn.BCEWithLogitsLoss(),
                         torch.optim.Adam(net.parameters(), lr=1e-4))

    fsm = FileStructManager(base_dir='data', is_continue=False)

    trainer = Trainer(config, fsm, torch.device('cuda:0')).set_epoch_num(2)

    tensorboard = TensorboardMonitor(fsm, is_continue=False, network_name='PortraitSegmentation')
    log = LogMonitor(fsm).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    def mean_train_loss() -> float:
        # Mean training loss drives best-state saving and LR decaying
        return np.mean(train_stage.get_losses())

    trainer.enable_best_states_saving(mean_train_loss)
    trainer.enable_lr_decaying(coeff=0.5, patience=10, target_val_clbk=mean_train_loss)
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))
    trainer.train()
예제 #12
0
def continue_training():
    """Resume an interrupted training run from the last checkpoint.

    The model, config and monitors are rebuilt exactly as for a fresh run;
    the FileStructManager is created with is_continue=True and
    Trainer.resume() loads the saved state before training.
    """
    net = resnet18(classes_num=1, in_channels=3, pretrained=True)
    config = TrainConfig([train_stage, val_stage],
                         torch.nn.BCEWithLogitsLoss(),
                         torch.optim.Adam(net.parameters(), lr=1e-4))

    # is_continue=True makes FileStructManager reuse the existing experiment dir
    fsm = FileStructManager(base_dir='data', is_continue=True)

    trainer = Trainer(net, config, fsm, torch.device('cuda:0')).set_epoch_num(10)

    tensorboard = TensorboardMonitor(fsm,
                                     is_continue=False,
                                     network_name='PortraitSegmentation')
    log = LogMonitor(fsm).write_final_metrics()
    trainer.monitor_hub.add_monitor(tensorboard).add_monitor(log)

    def mean_train_loss() -> float:
        # Mean training loss drives best-state saving and LR decaying
        return np.mean(train_stage.get_losses())

    trainer.enable_best_states_saving(mean_train_loss)
    trainer.enable_lr_decaying(coeff=0.5, patience=10, target_val_clbk=mean_train_loss)
    trainer.add_on_epoch_end_callback(
        lambda: tensorboard.update_scalar('params/lr', trainer.data_processor().get_lr()))

    # resume() switches the trainer into continue mode, then train
    trainer.resume(from_best_checkpoint=False).train()