def repair(in_path):
    """
    Remove the "anneal_func" entry from a checkpoint's LR scheduler state.

    The pickled checkpoint at ``in_path`` is loaded. When it holds an
    "lr_scheduler" entry whose deserialized state dict contains the key
    "anneal_func", that key is deleted, the state dict is re-serialized and
    the whole checkpoint is written to ``<in_path>.repaired``. The input
    file is only read, never rewritten. In every other case a message is
    printed saying that no repair is needed.

    :param in_path: path of the pickled checkpoint file to inspect
    """
    # NOTE: unpickling can execute arbitrary code; only run this on
    # checkpoint files that come from a trusted source.
    with open(in_path, "rb") as src:
        checkpoint = pickle.load(src)

    if "lr_scheduler" not in checkpoint:
        print("This checkpoint does not need repair")
        return

    print(
        "Loading LR scheduler state dict (this might take a few minutes)")
    with io.BytesIO(checkpoint["lr_scheduler"]) as stream:
        scheduler_state = deserialize_state_dict(stream)

    if "anneal_func" not in scheduler_state:
        print("This checkpoint does not need repair")
        return

    # Drop the offending entry and store the re-serialized scheduler state.
    del scheduler_state["anneal_func"]
    with io.BytesIO() as stream:
        serialize_state_dict(stream, scheduler_state)
        checkpoint["lr_scheduler"] = stream.getvalue()

    out_path = f"{in_path}.repaired"
    print(f"Saving {out_path}")
    with open(out_path, "wb") as dst:
        pickle.dump(checkpoint, dst)
    def test_identical(self):
        """
        Round-trip a sparse resnet50 through a pickled checkpoint file.

        A model is built with ``create_model``, its state dict is serialized
        and pickled into a temporary file, and a second model is built by
        ``create_model`` from that file. The test asserts that
        ``compare_models`` returns a truthy result for the two models.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        build_kwargs = {
            "model_class":
                nupic.research.frameworks.pytorch.models.resnets.resnet50,
            "model_args": {
                "config": {"num_classes": 3, "defaults_sparse": True},
            },
            "init_batch_norm": False,
            "device": device,
        }

        model = create_model(**build_kwargs)

        # Checkpoints are pickled dicts holding the serialized state bytes.
        with io.BytesIO() as stream:
            serialize_state_dict(stream, model.state_dict())
            state = {"model": stream.getvalue()}

        with tempfile.NamedTemporaryFile(delete=True) as checkpoint_file:
            pickle.dump(state, checkpoint_file)
            # Flush so the content is readable through the file name below.
            checkpoint_file.flush()

            model2 = create_model(checkpoint_file=checkpoint_file.name,
                                  **build_kwargs)

            self.assertTrue(compare_models(model, model2, (3, 224, 224)))
Exemplo n.º 3
0
    def setUp(self):
        """
        Prepare a modified MNISTSparseCNN, reference outputs and a checkpoint.

        The model is created with a fixed random seed and put in eval mode,
        then the selected weights are doubled so the model differs from a
        freshly initialized one. Reference outputs of ``full_forward``,
        ``lower_forward`` and ``upper_forward`` are recorded on random
        inputs. Finally the serialized model state is pickled to
        ``<tempdir>/results/mymodel`` (stored in ``self.checkpoint_path``).
        """
        set_random_seed(20)
        self.model = MNISTSparseCNN()
        self.model.eval()

        # Make all params twice as large to differentiate it from an init-ed model.
        # The update runs under no_grad: writing in place to a leaf parameter
        # that requires grad raises a RuntimeError while autograd is recording.
        with torch.no_grad():
            for name, param in self.model.named_parameters():
                if ("cnn" in name or "linear" in name) and "weight" in name:
                    param.mul_(2)

        self.in_1 = torch.rand(2, 1, 28, 28)
        self.in_2 = torch.rand(2, 1024)
        self.out_full = full_forward(self.model, self.in_1)
        self.out_lower = lower_forward(self.model, self.in_1)
        self.out_upper = upper_forward(self.model, self.in_2)

        # Create temporary results directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.results_dir = Path(self.tempdir.name) / Path("results")
        self.results_dir.mkdir()

        # Save model state.
        state = {}
        with io.BytesIO() as buffer:
            serialize_state_dict(buffer,
                                 self.model.state_dict(),
                                 compresslevel=-1)
            state["model"] = buffer.getvalue()

        self.checkpoint_path = self.results_dir / Path("mymodel")
        with open(self.checkpoint_path, "wb") as f:
            pickle.dump(state, f)
Exemplo n.º 4
0
    def test_creaate_model_from_checkpoint(self):
        """
        Build a resnet50, overwrite its weights, save only the model state
        to a pickled checkpoint file and verify that ``create_model`` given
        that file yields a model for which ``compare_models`` is truthy.
        """
        def fill_weights(module):
            # Simulate imagenet experiment by changing the weights
            weight = getattr(module, "weight", None)
            if weight is not None:
                weight.data.fill_(0.042)

        common_kwargs = {
            "model_class": resnet50,
            "model_args": {},
            "init_batch_norm": False,
            "device": "cpu",
        }

        model1 = create_model(**common_kwargs)
        model1.apply(fill_weights)

        # Save model checkpoint only, ignoring optimizer and other imagenet
        # experiment objects state. See ImagenetExperiment.get_state
        with io.BytesIO() as stream:
            serialize_state_dict(stream, model1.state_dict())
            state = {"model": stream.getvalue()}

        with tempfile.NamedTemporaryFile() as checkpoint_file:
            # Ray save checkpoints as pickled dicts
            pickle.dump(state, checkpoint_file)
            checkpoint_file.flush()

            # Load model from checkpoint
            model2 = create_model(checkpoint_file=checkpoint_file.name,
                                  **common_kwargs)

        self.assertTrue(compare_models(model1, model2, (3, 32, 32)))
Exemplo n.º 5
0
    def get_state(self):
        """
        Get experiment serialized state as a dictionary of byte arrays

        :return: dictionary with "current_epoch" plus the serialized "model"
                 and "optimizer" states; "lr_scheduler" is added when a
                 scheduler is set and "amp" when mixed precision is enabled
        """
        def to_bytes(state_dict):
            # Save state into a byte array to avoid ray's GPU serialization
            # issues. See https://github.com/ray-project/ray/issues/5519
            with io.BytesIO() as stream:
                serialize_state_dict(stream, state_dict)
                return stream.getvalue()

        state = {"current_epoch": self.current_epoch}
        # The state is read from the wrapped module (`self.model.module`).
        state["model"] = to_bytes(self.model.module.state_dict())
        state["optimizer"] = to_bytes(self.optimizer.state_dict())

        if self.lr_scheduler is not None:
            state["lr_scheduler"] = to_bytes(self.lr_scheduler.state_dict())

        if self.mixed_precision:
            state["amp"] = to_bytes(amp.state_dict())

        return state
Exemplo n.º 6
0
    def get_state(self):
        """
        Get experiment serialized state as a dictionary

        :return: dictionary with "current_epoch", "total_steps" and the
                 serialized "algorithm" state dict as bytes
        """
        state = dict(
            current_epoch=self.current_epoch,
            total_steps=self.total_steps,
        )

        # Save state into a byte array to avoid ray's GPU serialization issues
        # See https://github.com/ray-project/ray/issues/5519
        with io.BytesIO() as stream:
            serialize_state_dict(stream, self.algorithm.state_dict())
            state["algorithm"] = stream.getvalue()

        return state
Exemplo n.º 7
0
    def test_serialization(self):
        """
        Serialize one model's state dict into an in-memory buffer, load it
        back into a second model whose weights were overwritten, and assert
        that ``compare_models`` is truthy for the pair.
        """
        def fill_weights(module):
            weight = getattr(module, "weight", None)
            if weight is not None:
                weight.data.fill_(42.0)

        source_model = simple_linear_net()
        target_model = simple_linear_net()
        # Give the target different weights before loading the source state.
        target_model.apply(fill_weights)

        with io.BytesIO() as stream:
            serialize_state_dict(stream, source_model.state_dict())

            # Rewind so deserialization starts at the beginning of the data.
            stream.seek(0)
            target_model.load_state_dict(deserialize_state_dict(stream))

        self.assertTrue(compare_models(source_model, target_model, (32, )))
Exemplo n.º 8
0
def _create_test_checkpoint(file_name):
    """
    Write a checkpoint file for `test_checkpoint_backward_compatibility`.

    Whenever the `test_checkpoint_backward_compatibility` test fails, use
    this function to create a new checkpoint file from the previous version
    (commit) and update the test to include the new file.

    :param file_name: path of the checkpoint file to write
    """
    # Save model checkpoint only, ignoring optimizer and other imagenet
    # experiment objects state. See ImagenetExperiment.get_state
    with io.BytesIO() as stream:
        serialize_state_dict(
            stream, _create_test_model().state_dict(), compresslevel=9)
        state = {"model": stream.getvalue()}

    with open(file_name, "wb") as out:
        pickle.dump(state, out)
        out.flush()
Exemplo n.º 9
0
    def test_creaate_model_from_checkpoint(self):
        """
        Save the test model's state to a pickled checkpoint file and verify
        that ``create_model`` given that file yields a model for which
        ``compare_models`` is truthy.
        """
        reference = _create_test_model()

        # Save model checkpoint only, ignoring optimizer and other imagenet
        # experiment objects state. See ImagenetExperiment.get_state
        with io.BytesIO() as stream:
            serialize_state_dict(stream, reference.state_dict())
            state = {"model": stream.getvalue()}

        with tempfile.NamedTemporaryFile() as checkpoint_file:
            # Ray save checkpoints as pickled dicts
            pickle.dump(state, checkpoint_file)
            checkpoint_file.flush()

            # Load model from checkpoint
            restored = create_model(
                model_class=resnet50,
                model_args=TEST_MODEL_ARGS,
                init_batch_norm=False,
                device="cpu",
                checkpoint_file=checkpoint_file.name,
            )

        self.assertTrue(compare_models(reference, restored, (3, 32, 32)))
Exemplo n.º 10
0
    def test_identical(self):
        """
        Round-trip a resnet50 through a pickled checkpoint file.

        The state of one model is serialized and pickled into a temporary
        file, then loaded into a second, separately constructed model with
        ``load_state_from_checkpoint``. The test asserts that
        ``compare_models`` returns a truthy result for the two models.
        """
        resnet_cls = nupic.research.frameworks.pytorch.models.resnets.resnet50
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_args = {"num_classes": 3}

        model = resnet_cls(**model_args)
        model.to(device)

        with io.BytesIO() as stream:
            serialize_state_dict(stream, model.state_dict())
            state = {"model": stream.getvalue()}

        with tempfile.NamedTemporaryFile(delete=True) as checkpoint_file:
            pickle.dump(state, checkpoint_file)
            # Flush so the content is readable through the file name below.
            checkpoint_file.flush()

            model2 = resnet_cls(**model_args)
            model2.to(device)
            load_state_from_checkpoint(model2, checkpoint_file.name, device)

            self.assertTrue(compare_models(model, model2, (3, 224, 224)))
Exemplo n.º 11
0
    def setUp(self):
        """
        Create a small Linear + KWinners model and save its state.

        The serialized model state is pickled to ``<tempdir>/results/mymodel``
        and the path is kept in ``self.checkpoint_path``.
        """
        set_random_seed(20)
        layers = [
            torch.nn.Linear(8, 8),
            KWinners(8, percent_on=0.1),
        ]
        self.model = torch.nn.Sequential(*layers)

        # Create temporary results directory.
        self.tempdir = tempfile.TemporaryDirectory()
        self.results_dir = Path(self.tempdir.name) / Path("results")
        self.results_dir.mkdir()

        # Save model state.
        with io.BytesIO() as stream:
            serialize_state_dict(
                stream, self.model.state_dict(), compresslevel=-1)
            state = {"model": stream.getvalue()}

        self.checkpoint_path = self.results_dir / Path("mymodel")
        with open(self.checkpoint_path, "wb") as out:
            pickle.dump(state, out)
Exemplo n.º 12
0
    def get_state(self):
        """
        Get experiment serialized state as a dictionary of byte arrays

        :return: dictionary with "current_epoch" plus the serialized "model"
                 and "optimizer" states; "lr_scheduler" is added when a
                 scheduler is set and "amp" when mixed precision is enabled
        """
        def to_bytes(state_dict):
            # Save state into a byte array to avoid ray's GPU serialization
            # issues. See https://github.com/ray-project/ray/issues/5519
            with io.BytesIO() as stream:
                serialize_state_dict(stream, state_dict)
                return stream.getvalue()

        state = {
            "current_epoch": self.current_epoch,
        }

        # Unwrap the model when it exposes a `module` attribute
        # (DistributedDataParallel).
        model = getattr(self.model, "module", self.model)
        state["model"] = to_bytes(model.state_dict())
        state["optimizer"] = to_bytes(self.optimizer.state_dict())

        if self.lr_scheduler is not None:
            scheduler_state = self.lr_scheduler.state_dict()
            if "anneal_func" in scheduler_state:
                # FIXME: This is a workaround for a PyTorch bug.
                # https://github.com/pytorch/pytorch/issues/42376
                del scheduler_state["anneal_func"]
            state["lr_scheduler"] = to_bytes(scheduler_state)

        if self.mixed_precision:
            state["amp"] = to_bytes(amp.state_dict())

        return state