def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.training.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.training.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.training.checkpoint_dir, cfg.model_name, cfg.training.weight_name, strict=True)

    # Setup the dataset config
    # Generic config

    dataset = instantiate_dataset(cfg.data)
    model = checkpoint.create_model(dataset, weight_name=cfg.training.weight_name)
    log.info(model)
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    # Run training / evaluation
    output_path = os.path.join(cfg.training.checkpoint_dir, cfg.data.name, "features")
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)

    run(model, dataset, device, output_path, cfg)
示例#2
0
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.training.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.training.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.training.checkpoint_dir, cfg.model_name, cfg.training.weight_name, strict=True)

    # Setup the dataset config
    # Generic config

    dataset = instantiate_dataset(cfg.data)
    if not checkpoint.is_empty:
        model = checkpoint.create_model(dataset, weight_name=cfg.training.weight_name)
    else:
        log.info("No Checkpoint for this model")
        model = instantiate_model(copy.deepcopy(cfg), dataset)
        model.set_pretrained_weights()
    log.info(model)
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    run(model, dataset, device, cfg)
示例#3
0
    def test_best_metric(self):
        self.run_path = os.path.join(DIR, "checkpt")
        if not os.path.exists(self.run_path):
            os.makedirs(self.run_path)

        model_checkpoint = ModelCheckpoint(self.run_path,
                                           self.model_name,
                                           "test",
                                           run_config=self.config,
                                           resume=False)
        model = MockModel()
        optimal_state = model.state.item()
        metric_func = {"acc": max}
        mock_metrics = {
            "current_metrics": {
                "acc": 12
            },
            "stage": "test",
            "epoch": 10
        }
        model_checkpoint.save_best_models_under_current_metrics(
            model, mock_metrics, metric_func)
        model.state[0] = 2
        mock_metrics = {
            "current_metrics": {
                "acc": 0
            },
            "stage": "test",
            "epoch": 11
        }
        model_checkpoint.save_best_models_under_current_metrics(
            model, mock_metrics, metric_func)
        mock_metrics = {
            "current_metrics": {
                "acc": 10
            },
            "stage": "train",
            "epoch": 11
        }
        model_checkpoint.save_best_models_under_current_metrics(
            model, mock_metrics, metric_func)
        mock_metrics = {
            "current_metrics": {
                "acc": 15
            },
            "stage": "train",
            "epoch": 11
        }
        model_checkpoint.save_best_models_under_current_metrics(
            model, mock_metrics, metric_func)
        self.assertEqual(model_checkpoint.checkpoint_path,
                         os.path.join(self.run_path, self.model_name + ".pt"))

        ckp = torch.load(model_checkpoint.checkpoint_path)

        self.assertEqual(ckp["models"]["best_acc"]["state"].item(),
                         optimal_state)
        self.assertEqual(ckp["models"]["latest"]["state"].item(),
                         model.state.item())
示例#4
0
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cfg.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.checkpoint_dir,
                                 cfg.model_name,
                                 cfg.weight_name,
                                 strict=True)

    # Setup the dataset config
    # Generic config
    train_dataset_cls = get_dataset_class(checkpoint.data_config)
    setattr(checkpoint.data_config, "class", train_dataset_cls.FORWARD_CLASS)
    setattr(checkpoint.data_config, "dataroot", cfg.input_path)

    # Datset specific configs
    if cfg.data:
        for key, value in cfg.data.items():
            checkpoint.data_config.update(key, value)

    # Create dataset and mdoel
    dataset = instantiate_dataset(checkpoint.data_config)
    model = checkpoint.create_model(dataset, weight_name=cfg.weight_name)
    log.info(model)
    log.info(
        "Model size = %i",
        sum(param.numel() for param in model.parameters()
            if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.batch_size,
        cfg.shuffle,
        cfg.num_workers,
        False,
    )
    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    # Run training / evaluation
    if not os.path.exists(cfg.output_path):
        os.makedirs(cfg.output_path)

    run(model, dataset, device, cfg.output_path)
示例#5
0
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cfg.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.checkpoint_dir,
                                 cfg.model_name,
                                 cfg.weight_name,
                                 strict=True)

    # Create model and datasets
    dataset = instantiate_dataset(checkpoint.data_config)
    model = checkpoint.create_model(dataset, weight_name=cfg.weight_name)
    log.info(model)
    log.info(
        "Model size = %i",
        sum(param.numel() for param in model.parameters()
            if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.batch_size,
        cfg.shuffle,
        cfg.num_workers,
        cfg.precompute_multi_scale,
    )
    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    tracker: BaseTracker = dataset.get_tracker(model, dataset, False, False)

    # Run training / evaluation
    run(
        cfg,
        model,
        dataset,
        device,
        tracker,
        checkpoint,
        voting_runs=cfg.voting_runs,
        tracker_options=cfg.tracker_options,
    )
示例#6
0
def train_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device: str,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    debugging,
):

    early_break = getattr(debugging, "early_break", False)
    profiling = getattr(debugging, "profiling", False)

    model.train()
    tracker.reset("train")
    visualizer.reset(epoch, "train")
    train_loader = dataset.train_dataloader

    iter_data_time = time.time()
    with Ctq(train_loader) as tq_train_loader:
        for i, data in enumerate(tq_train_loader):
            model.set_input(data, device)
            t_data = time.time() - iter_data_time

            iter_start_time = time.time()
            model.optimize_parameters(epoch, dataset.batch_size)
            if i % 10 == 0:
                tracker.track(model)

            tq_train_loader.set_postfix(**tracker.get_metrics(),
                                        data_loading=float(t_data),
                                        iteration=float(time.time() -
                                                        iter_start_time),
                                        color=COLORS.TRAIN_COLOR)

            if visualizer.is_active:
                visualizer.save_visuals(model.get_current_visuals())

            iter_data_time = time.time()

            if early_break:
                break

            if profiling:
                if i > getattr(debugging, "num_batches", 50):
                    return 0

    metrics = tracker.publish(epoch)
    checkpoint.save_best_models_under_current_metrics(model, metrics,
                                                      tracker.metric_func)
    log.info("Learning rate = %f" % model.learning_rate)
    def test_model_ckpt_using_pointnet2ms(self,):
        # Create a checkpt

        self.run_path = os.path.join(DIR, "checkpt")
        if not os.path.exists(self.run_path):
            os.makedirs(self.run_path)

        model_checkpoint = ModelCheckpoint(self.run_path, self.model_name, "test", run_config=self.config, resume=False)
        dataset = MockDatasetGeometric(5)
        model = instantiate_model(self.config, dataset)
        model.set_input(dataset[0], "cpu")
        model.instantiate_optimizers(self.config)

        mock_metrics = {"current_metrics": {"acc": 12}, "stage": "test", "epoch": 10}
        model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

        # Load checkpoint and initialize model
        model_checkpoint = ModelCheckpoint(self.run_path, self.model_name, "test", self.config, resume=True)
        model2 = model_checkpoint.create_model(dataset, weight_name="acc")

        self.assertEqual(str(model.optimizer.__class__.__name__), str(model2.optimizer.__class__.__name__))
        self.assertEqual(model.optimizer.defaults, model2.optimizer.defaults)
        self.assertEqual(model.schedulers["lr_scheduler"].state_dict(), model2.schedulers["lr_scheduler"].state_dict())
        self.assertEqual(model.schedulers["bn_scheduler"].state_dict(), model2.schedulers["bn_scheduler"].state_dict())

        remove(os.path.join(ROOT, "{}.pt".format(self.model_name)))
        remove(os.path.join(DIR, "{}.pt".format(self.model_name)))
示例#8
0
def test_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    debugging,
):
    early_break = getattr(debugging, "early_break", False)
    model.eval()

    loaders = dataset.test_dataloaders

    for loader in loaders:
        stage_name = loader.dataset.name
        tracker.reset(stage_name)
        visualizer.reset(epoch, stage_name)
        with Ctq(loader) as tq_test_loader:
            for data in tq_test_loader:
                with torch.no_grad():
                    model.set_input(data, device)
                    model.forward()

                tracker.track(model)
                tq_test_loader.set_postfix(**tracker.get_metrics(),
                                           color=COLORS.TEST_COLOR)

                if visualizer.is_active:
                    visualizer.save_visuals(model.get_current_visuals())

                if early_break:
                    break

        tracker.finalise()
        metrics = tracker.publish(epoch)
        tracker.print_summary()
        checkpoint.save_best_models_under_current_metrics(
            model, metrics, tracker.metric_func)
示例#9
0
    def test_dataset_properties(self):
        self.run_path = os.path.join(DIR, "checkpt")
        if not os.path.exists(self.run_path):
            os.makedirs(self.run_path)

        model_checkpoint = ModelCheckpoint(self.run_path,
                                           self.model_name,
                                           "test",
                                           run_config=self.config,
                                           resume=False)
        model_checkpoint.dataset_properties = {"first": 1, "num_classes": 20}
        model = MockModel()
        metric_func = {"acc": max}
        mock_metrics = {
            "current_metrics": {
                "acc": 12
            },
            "stage": "test",
            "epoch": 10
        }
        metric_func = {"acc": max}
        model_checkpoint.save_best_models_under_current_metrics(
            model, mock_metrics, metric_func)

        ckp = ModelCheckpoint(self.run_path,
                              self.model_name,
                              "test",
                              run_config=self.config,
                              resume=False)

        self.assertEqual(ckp.dataset_properties,
                         model_checkpoint.dataset_properties)
示例#10
0
def eval_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    debugging,
):

    early_break = getattr(debugging, "early_break", False)

    model.eval()
    tracker.reset("val")
    visualizer.reset(epoch, "val")
    loader = dataset.val_dataloader
    with Ctq(loader) as tq_val_loader:
        for data in tq_val_loader:
            with torch.no_grad():
                model.set_input(data, device)
                model.forward()

            tracker.track(model)
            tq_val_loader.set_postfix(**tracker.get_metrics(),
                                      color=COLORS.VAL_COLOR)

            if visualizer.is_active:
                visualizer.save_visuals(model.get_current_visuals())

            if early_break:
                break

    metrics = tracker.publish(epoch)
    tracker.print_summary()
    checkpoint.save_best_models_under_current_metrics(model, metrics,
                                                      tracker.metric_func)
示例#11
0
    def from_pretrained(model_tag,
                        download=True,
                        out_file=None,
                        weight_name="latest",
                        mock_dataset=True):
        # Convert inputs to registry format

        if PretainedRegistry.MODELS.get(model_tag) is not None:
            url = PretainedRegistry.MODELS.get(model_tag)
        else:
            raise Exception(
                "model_tag {} doesn't exist within available models. Here is the list of pre-trained models {}"
                .format(model_tag, PretainedRegistry.available_models()))

        checkpoint_name = model_tag + ".pt"
        out_file = os.path.join(CHECKPOINT_DIR, checkpoint_name)

        if download:
            download_file(url, out_file)

            weight_name = weight_name if weight_name is not None else "latest"

            checkpoint: ModelCheckpoint = ModelCheckpoint(
                CHECKPOINT_DIR,
                model_tag,
                weight_name if weight_name is not None else "latest",
                resume=False,
            )
            if mock_dataset:
                dataset = checkpoint.dataset_properties.copy()
                if PretainedRegistry.MOCK_USED_PROPERTIES.get(
                        model_tag) is not None:
                    for k, v in PretainedRegistry.MOCK_USED_PROPERTIES.get(
                            model_tag).items():
                        dataset[k] = v

            else:
                dataset = instantiate_dataset(checkpoint.data_config)

            model: BaseModel = checkpoint.create_model(dataset,
                                                       weight_name=weight_name)

            Wandb.set_urls_to_model(model, url)

            BaseDataset.set_transform(model, checkpoint.data_config)

            return model
示例#12
0
    def from_file(path, weight_name="latest", mock_property=None):
        """
        Load a pretrained model trained with torch-points3d from file.
        return a pretrained model
        Parameters
        ----------
        path: str
            path of a pretrained model
        weight_name: str, optional
            name of the weight
        mock_property: dict, optional
            mock dataset

        """
        weight_name = weight_name if weight_name is not None else "latest"
        path_dir, name = os.path.split(path)
        name = name.split(".")[0]  # ModelCheckpoint will add the extension

        checkpoint: ModelCheckpoint = ModelCheckpoint(
            path_dir,
            name,
            weight_name if weight_name is not None else "latest",
            resume=False,
        )
        dataset = checkpoint.data_config

        if mock_property is not None:
            for k, v in mock_property.items():
                dataset[k] = v

        else:
            dataset = instantiate_dataset(checkpoint.data_config)

        model: BaseModel = checkpoint.create_model(dataset,
                                                   weight_name=weight_name)
        BaseDataset.set_transform(model, checkpoint.data_config)
        return model
示例#13
0
    def _initialize_trainer(self):
        # Enable CUDNN BACKEND
        torch.backends.cudnn.enabled = self.enable_cudnn

        if not self.has_training:
            self._cfg.training = self._cfg
            resume = bool(self._cfg.checkpoint_dir)
        else:
            resume = bool(self._cfg.training.checkpoint_dir)

        # Get device
        if self._cfg.training.cuda > -1 and torch.cuda.is_available():
            device = "cuda"
            torch.cuda.set_device(self._cfg.training.cuda)
        else:
            device = "cpu"
        self._device = torch.device(device)
        log.info("DEVICE : {}".format(self._device))

        # Profiling
        if self.profiling:
            # Set the num_workers as torch.utils.bottleneck doesn't work well with it
            self._cfg.training.num_workers = 0

        # Start Wandb if public
        if self.wandb_log:
            Wandb.launch(self._cfg, self._cfg.wandb.public and self.wandb_log)

        # Checkpoint

        self._checkpoint: ModelCheckpoint = ModelCheckpoint(
            self._cfg.training.checkpoint_dir,
            self._cfg.model_name,
            self._cfg.training.weight_name,
            run_config=self._cfg,
            resume=resume,
        )

        # Create model and datasets
        if not self._checkpoint.is_empty:
            self._dataset: BaseDataset = instantiate_dataset(
                self._checkpoint.data_config)
            self._model: BaseModel = self._checkpoint.create_model(
                self._dataset, weight_name=self._cfg.training.weight_name)
        else:
            self._dataset: BaseDataset = instantiate_dataset(self._cfg.data)
            self._model: BaseModel = instantiate_model(
                copy.deepcopy(self._cfg), self._dataset)
            self._model.instantiate_optimizers(self._cfg, "cuda" in device)
            self._model.set_pretrained_weights()
            if not self._checkpoint.validate(self._dataset.used_properties):
                log.warning(
                    "The model will not be able to be used from pretrained weights without the corresponding dataset. Current properties are {}"
                    .format(self._dataset.used_properties))
        self._checkpoint.dataset_properties = self._dataset.used_properties

        log.info(self._model)

        self._model.log_optimizers()
        log.info(
            "Model size = %i",
            sum(param.numel() for param in self._model.parameters()
                if param.requires_grad))

        # Set dataloaders
        self._dataset.create_dataloaders(
            self._model,
            self._cfg.training.batch_size,
            self._cfg.training.shuffle,
            self._cfg.training.num_workers,
            self.precompute_multi_scale,
        )
        log.info(self._dataset)

        # Verify attributes in dataset
        self._model.verify_data(self._dataset.train_dataset[0])

        # Choose selection stage
        selection_stage = getattr(self._cfg, "selection_stage", "")
        self._checkpoint.selection_stage = self._dataset.resolve_saving_stage(
            selection_stage)
        self._tracker: BaseTracker = self._dataset.get_tracker(
            self.wandb_log, self.tensorboard_log)

        if self.wandb_log:
            Wandb.launch(self._cfg, not self._cfg.wandb.public
                         and self.wandb_log)

        # Run training / evaluation
        self._model = self._model.to(self._device)
        if self.has_visualization:
            self._visualizer = Visualizer(self._cfg.visualization,
                                          self._dataset.num_batches,
                                          self._dataset.batch_size,
                                          os.getcwd())
示例#14
0
def main(cfg):
    OmegaConf.set_struct(
        cfg,
        False)  # This allows getattr and hasattr methods to function correctly
    if cfg.pretty_print:
        print(cfg.pretty())

    # Get device
    device = torch.device("cuda" if (
        torch.cuda.is_available() and cfg.training.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.training.enable_cudnn

    # Profiling
    profiling = getattr(cfg.debugging, "profiling", False)
    if profiling:
        # Set the num_workers as torch.utils.bottleneck doesn't work well with it
        cfg.training.num_workers = 0

    # Start Wandb if public
    launch_wandb(cfg, cfg.wandb.public and cfg.wandb.log)

    # Checkpoint
    checkpoint = ModelCheckpoint(
        cfg.training.checkpoint_dir,
        cfg.model_name,
        cfg.training.weight_name,
        run_config=cfg,
        resume=bool(cfg.training.checkpoint_dir),
    )

    # Create model and datasets
    if not checkpoint.is_empty:
        dataset = instantiate_dataset(checkpoint.data_config)
        model = checkpoint.create_model(dataset,
                                        weight_name=cfg.training.weight_name)
    else:
        dataset = instantiate_dataset(cfg.data)
        model = instantiate_model(cfg, dataset)
        model.instantiate_optimizers(cfg)
    log.info(model)
    model.log_optimizers()
    log.info(
        "Model size = %i",
        sum(param.numel() for param in model.parameters()
            if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.training.batch_size,
        cfg.training.shuffle,
        cfg.training.num_workers,
        cfg.training.precompute_multi_scale,
    )
    log.info(dataset)

    # Choose selection stage
    selection_stage = getattr(cfg, "selection_stage", "")
    checkpoint.selection_stage = dataset.resolve_saving_stage(selection_stage)
    tracker: BaseTracker = dataset.get_tracker(model, dataset, cfg.wandb.log,
                                               cfg.tensorboard.log)

    launch_wandb(cfg, not cfg.wandb.public and cfg.wandb.log)

    # Run training / evaluation
    model = model.to(device)
    visualizer = Visualizer(cfg.visualization, dataset.num_batches,
                            dataset.batch_size, os.getcwd())
    run(cfg, model, dataset, device, tracker, checkpoint, visualizer)

    # https://github.com/facebookresearch/hydra/issues/440
    hydra._internal.hydra.GlobalHydra.get_state().clear()
    return 0