Example #1
    def __init__(self,
                 num_train_data,
                 max_updates,
                 max_epochs,
                 device="cuda",
                 fp16_model=False):
        config = get_config_with_defaults({
            "training": {
                "max_updates": max_updates,
                "max_epochs": max_epochs,
                "evaluation_interval": 10000,
                "fp16": True,
            },
            "run_type": "train",
        })
        super().__init__(num_train_data, config=config)
        if fp16_model:
            assert (torch.cuda.is_available()
                    ), "MMFTrainerMock fp16 requires cuda enabled"
            model = SimpleModelWithFp16Assert({"in_dim": 1})
            model.build()
            model = model.cuda()
        else:
            model = SimpleModel({"in_dim": 1})
            model.build()
            model.train()
            model.to(self.device)

        self.model = model
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-3)
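Every snippet on this page funnels its override dict through `get_config_with_defaults`, a helper from MMF's test utilities that layers the overrides on top of a full default configuration. The real helper pulls MMF's own defaults; the sketch below only illustrates the merge-over-defaults idea with OmegaConf (which MMF's config system is built on) and a made-up, minimal set of defaults.

# Illustrative sketch only; not MMF's actual implementation.
# It deep-merges user overrides over a hypothetical default config.
from omegaconf import OmegaConf

_FAKE_DEFAULTS = {
    "run_type": "train",
    "training": {"max_updates": None, "max_epochs": None, "batch_size": 1},
    "trainer": {"params": {}},
}

def get_config_with_defaults_sketch(overrides):
    # OmegaConf.merge performs a recursive merge; values from `overrides` win.
    return OmegaConf.merge(
        OmegaConf.create(_FAKE_DEFAULTS), OmegaConf.create(overrides)
    )

config = get_config_with_defaults_sketch(
    {"training": {"max_updates": 5, "max_epochs": None}}
)
print(config.training.max_updates)  # 5 (from the overrides)
print(config.training.batch_size)   # 1 (kept from the fake defaults)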
Example #2
 def test_eval_loop(self, a, b):
     config = get_config_with_defaults(
         {"training": {"max_updates": 2, "max_epochs": 2}}
     )
     trainer = get_mmf_trainer(config=config)
     combined_report, meter = trainer.evaluation_loop("val")
     self.assertAlmostEqual(combined_report["losses"]["loss"], 493377.5312)
     self.assertAlmostEqual(combined_report["logits"].item(), -0.2379742, 6)
Example #3
 def _get_config(self, max_steps, max_epochs):
     config = {
         "trainer": {
             "params": {
                 "max_steps": max_steps,
                 "max_epochs": max_epochs
             }
         }
     }
     return get_config_with_defaults(config)
Example #4
    def test_loss_computation_parity_with_mmf_trainer(self):
        # compute mmf_trainer training losses
        def _on_update_end(report, meter, should_log):
            self.mmf_losses.append(report["losses"]["loss"].item())

        config = get_config_with_defaults(
            {"training": {"max_updates": 5, "max_epochs": None}}
        )
        mmf_trainer = get_mmf_trainer(config=config)
        mmf_trainer.on_update_end = _on_update_end
        mmf_trainer.evaluation_loop = MagicMock(return_value=(None, None))
        mmf_trainer.training_loop()

        # compute lightning_trainer training losses
        with patch("mmf.trainers.lightning_trainer.get_mmf_env", return_value=""):
            config = get_config_with_defaults({"trainer": {"params": {"max_steps": 5}}})
            trainer = get_lightning_trainer(config=config)
            trainer.callbacks.append(self)
            trainer.trainer.fit(trainer.model, trainer.data_module.train_loader)
Example #5
 def _get_config(self, max_steps, max_epochs, gradient_clip_val):
     config = {
         "trainer": {
             "params": {
                 "max_steps": max_steps,
                 "max_epochs": max_epochs,
                 "gradient_clip_val": gradient_clip_val,
             }
         }
     }
     return get_config_with_defaults(config)
Example #6
 def _get_mmf_config(self, max_updates, max_epochs, batch_size,
                     evaluation_interval):
     config = {
         "training": {
             "max_updates": max_updates,
             "max_epochs": max_epochs,
             "batch_size": batch_size,
             "evaluation_interval": evaluation_interval,
         }
     }
     return get_config_with_defaults(config)
Example #7
 def _get_mmf_config(
     self, max_updates, max_epochs, max_grad_l2_norm, clip_norm_mode
 ):
     config = {
         "training": {
             "max_updates": max_updates,
             "max_epochs": max_epochs,
             "clip_gradients": True,
             "max_grad_l2_norm": max_grad_l2_norm,
             "clip_norm_mode": clip_norm_mode,
         }
     }
     return get_config_with_defaults(config)
Example #8
 def _get_config(self, accumulate_grad_batches, max_steps, batch_size):
     config = {
         "trainer": {
             "params": {
                 "accumulate_grad_batches": accumulate_grad_batches,
                 "max_steps": max_steps,
             }
         },
         "training": {
             "batch_size": batch_size
         },
     }
     return get_config_with_defaults(config)
Example #9
 def _get_config(self, max_steps, batch_size, log_every_n_steps,
                 val_check_interval, tensorboard):
     config = {
         "trainer": {
             "params": {
                 "max_steps": max_steps,
                 "log_every_n_steps": log_every_n_steps,
                 "val_check_interval": val_check_interval,
             }
         },
         "training": {
             "batch_size": batch_size,
             "tensorboard": tensorboard
         },
     }
     return get_config_with_defaults(config)
Example #10
 def _get_config(
     self,
     max_updates,
     max_epochs,
     batch_size=1,
     update_frequency=1,
     batch_size_per_device=None,
 ):
     config = {
         "training": {
             "max_updates": max_updates,
             "max_epochs": max_epochs,
             "update_frequency": update_frequency,
             "batch_size": batch_size,
             "batch_size_per_device": batch_size_per_device,
         }
     }
     return get_config_with_defaults(config)
Example #11
 def _get_mmf_config(
     self,
     max_updates,
     max_epochs,
     batch_size,
     log_interval,
     evaluation_interval,
     tensorboard,
 ):
     config = {
         "training": {
             "batch_size": batch_size,
             "tensorboard": tensorboard,
             "max_updates": max_updates,
             "max_epochs": max_epochs,
             "log_interval": log_interval,
             "evaluation_interval": evaluation_interval,
         }
     }
     return get_config_with_defaults(config)
Example #12
    def test_lr_schedule_compared_to_mmf_is_same(self):
        config = get_config_with_defaults(
            {"training": {"max_updates": 8, "max_epochs": None, "lr_scheduler": True}}
        )

        mmf_trainer = get_mmf_trainer(config=config)
        mmf_trainer.lr_scheduler_callback = LRSchedulerCallback(config, mmf_trainer)
        mmf_trainer.callbacks.append(mmf_trainer.lr_scheduler_callback)
        mmf_trainer.on_update_end = mmf_trainer.lr_scheduler_callback.on_update_end
        mmf_trainer.evaluation_loop = MagicMock(return_value=(None, None))
        mmf_trainer.training_loop()

        with patch("mmf.trainers.lightning_trainer.get_mmf_env", return_value=""):
            config = self._get_config(max_steps=8, lr_scheduler=True)
            trainer = get_lightning_trainer(config=config)
            trainer.trainer.fit(trainer.model, trainer.data_module.train_loader)

            mmf_trainer.model.to(trainer.model.device)
            last_model_param1 = list(mmf_trainer.model.parameters())[-1]
            last_model_param2 = list(trainer.model.parameters())[-1]
            self.assertTrue(torch.allclose(last_model_param1, last_model_param2))
Example #13
    def __init__(self,
                 config,
                 num_train_data,
                 max_updates,
                 max_epochs,
                 device="cuda"):
        config.training.max_updates = max_updates
        config.training.max_epochs = max_epochs
        config.training.fp16 = True
        config = get_config_with_defaults(config)

        super().__init__(num_train_data, config=config)
        self.device = torch.device(device)
        self.config = config
        self.model = SimpleModel({"in_dim": 1})
        self.model.build()
        self.model = self.model.cuda()
        self.optimizer = build_optimizer(self.model, self.config)
        self.distributed = True
        self.local_rank = 0
        self.parallelize_model()
        self.load_fp16_scaler()
Example #14
 def _get_config(
     self,
     max_steps,
     batch_size,
     val_check_interval,
     log_every_n_steps,
     limit_val_batches,
 ):
     config = {
         "trainer": {
             "params": {
                 "max_steps": max_steps,
                 "log_every_n_steps": log_every_n_steps,
                 "val_check_interval": val_check_interval,
                 "limit_val_batches": limit_val_batches,
             }
         },
         "training": {
             "batch_size": batch_size
         },
     }
     return get_config_with_defaults(config)
Example #15
 def _get_config(self, max_steps, lr_scheduler=False):
     config = {
         "trainer": {"params": {"max_steps": max_steps}},
         "training": {"lr_scheduler": lr_scheduler},
     }
     return get_config_with_defaults(config)
Example #16
    def _get_ckpt_config(self,
                         is_pl=False,
                         ckpt_config=None,
                         max_steps=6,
                         resume_from_checkpoint=None):
        if ckpt_config is None:
            ckpt_config = {}

        if not is_pl:
            return get_config_with_defaults({
                "training": {
                    "max_updates": max_steps,
                    "max_epochs": None,
                    "early_stop": {
                        "enabled": True,
                        "criteria": "numbers/accuracy",
                        "minimize": False,
                    },
                    "checkpoint_interval": 2,
                    "evaluation_interval": 2,
                },
                "model": "simple_model",
                "evaluation": {
                    "metrics": ["accuracy"]
                },
                "checkpoint": {
                    "max_to_keep": 1,
                    "save_git_details": False,
                    **ckpt_config,
                },
                "run_type": "train_val",
            })
        else:
            return get_config_with_defaults({
                "training": {
                    "checkpoint_interval": 2,
                    "early_stop": {
                        "enabled": True,
                        "criteria": "numbers/accuracy",
                        "minimize": False,
                    },
                },
                "trainer": {
                    "params": {
                        "max_steps": max_steps,
                        "max_epochs": None,
                        "checkpoint_callback": True,
                        "resume_from_checkpoint": resume_from_checkpoint,
                        "val_check_interval": 2,
                    }
                },
                "model": "simple_lightning_model",
                "evaluation": {
                    "metrics": ["accuracy"]
                },
                "checkpoint": {
                    "max_to_keep": 1,
                    "save_git_details": False,
                    **ckpt_config,
                },
                "run_type": "train_val",
            })