def test_load_metadata(tmpdir, location):
    """
    Test that load_metadata can look in the given directory as well as the
    directory above it, and that it raises ``FileNotFoundError`` when no
    metadata file exists.
    """
    model_dir = os.path.join(tmpdir, "some-model-dir")
    os.mkdir(model_dir)
    if location:
        with open(os.path.join(model_dir, location), "w") as f:
            json.dump(dict(key="value"), f)
        assert serializer.load_metadata(model_dir) == dict(key="value")
    else:
        # Attempting to load a file which doesn't exist raises FileNotFoundError
        with pytest.raises(FileNotFoundError):
            serializer.load_metadata(tmpdir)
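# A hedged reconstruction of the fixture supplying ``location`` above; the
# concrete values are assumptions, not taken from the source. A relative
# path writes the metadata file into the directory above the model
# directory, and ``None`` exercises the FileNotFoundError branch.
import os
import pytest

@pytest.fixture(params=["metadata.json", os.path.join("..", "metadata.json"), None])
def location(request):
    return request.param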
def test_pipeline_serialization(self):
    pipe = Pipeline(
        [
            ("pca1", PCA(n_components=10)),
            (
                "fu",
                FeatureUnion(
                    [
                        ("pca2", PCA(n_components=3)),
                        (
                            "pipe",
                            Pipeline(
                                [
                                    ("minmax", MinMaxScaler()),
                                    ("truncsvd", TruncatedSVD(n_components=7)),
                                ]
                            ),
                        ),
                    ]
                ),
            ),
            ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
        ]
    )
    X = np.random.random(size=100).reshape(10, 10)
    pipe.fit(X.copy(), X.copy())

    with TemporaryDirectory() as tmp:

        # Test dump
        metadata = {"key": "value"}
        serializer.dump(pipe, tmp, metadata=metadata)

        # Test load from the serialized pipeline above
        pipe_clone = serializer.load(tmp)
        metadata_clone = serializer.load_metadata(tmp)

        # Ensure the metadata was saved and loaded back
        self.assertEqual(metadata, metadata_clone)

        # Verify same state for both pipelines
        y_hat_pipe1 = pipe.predict(X.copy()).flatten()
        y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
        self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))

        # Now use dumps/loads
        serialized = serializer.dumps(pipe)
        pipe_clone = serializer.loads(serialized)

        # Verify same state for both pipelines
        y_hat_pipe1 = pipe.predict(X.copy()).flatten()
        y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
        self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))
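# A minimal sketch of the same dumps/loads round trip on a plain
# scikit-learn pipeline, dropping the Keras step so the example is
# self-contained; the gordo import path is an assumption.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from gordo import serializer  # assumed import path


def dumps_loads_roundtrip():
    X = np.random.random((10, 10))
    pipe = Pipeline([("minmax", MinMaxScaler()), ("pca", PCA(n_components=3))])
    pipe.fit(X)
    # Serialize to bytes and back, then check the clone behaves identically.
    clone = serializer.loads(serializer.dumps(pipe))
    assert np.allclose(pipe.transform(X), clone.transform(X))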
def _load_compressed_metadata(directory: str, name: str):
    """
    Load the metadata for model ``name`` from directory ``directory``, and
    return it as a zlib-compressed pickle, to use as little space as
    possible in the cache.

    Notes
    -----
    Simple measurements indicated that a typical metadata dict uses 37kB
    in memory, while pickled it uses 8kB, and pickled-and-compressed 4kB.
    """
    metadata = serializer.load_metadata(os.path.join(directory, name))
    return zlib.compress(pickle.dumps(metadata))
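# The natural inverse of _load_compressed_metadata, for reading a cached
# blob back into a dict; a sketch assuming the blob is exactly the
# zlib-compressed pickle produced above.
import pickle
import zlib


def _decompress_metadata(blob: bytes) -> dict:
    # Reverses zlib.compress(pickle.dumps(metadata)).
    return pickle.loads(zlib.decompress(blob))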
def metadata(trained_model_directories, trained_model_directory):
    # ``trained_model_directories`` is requested only as a dependency,
    # ensuring the models have been built before metadata is loaded
    # from one of them.
    return serializer.load_metadata(trained_model_directory)
def build(
    self,
    output_dir: Optional[Union[os.PathLike, str]] = None,
    model_register_dir: Optional[Union[os.PathLike, str]] = None,
    replace_cache=False,
) -> Tuple[sklearn.base.BaseEstimator, Machine]:
    """
    Always return a model and its metadata.

    If ``output_dir`` is supplied, the model is saved there.
    ``model_register_dir`` points to the model cache directory from which
    an attempt is made to read the model. Supplying both has the combined
    effect: the model is read from the cache and that cached model is
    saved to the new output directory.

    Parameters
    ----------
    output_dir: Optional[Union[os.PathLike, str]]
        A path to where the model will be deposited.
    model_register_dir: Optional[Union[os.PathLike, str]]
        A path to a register, see :func:`gordo.util.disk_registry`.
        If this is None then always build the model, otherwise try to
        resolve the model from the registry.
    replace_cache: bool
        Forces a rebuild of the model, and replaces the entry in the
        cache with the new model.

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, Machine]
        Built model and an updated ``Machine``
    """
    if not model_register_dir:
        model, machine = self._build()
    else:
        logger.debug(
            f"Model caching activated, attempting to read model-location with key "
            f"{self.cache_key} from register {model_register_dir}"
        )
        self.cached_model_path = self.check_cache(model_register_dir)

        if replace_cache:
            logger.info("replace_cache=True, deleting any existing cache entry")
            disk_registry.delete_value(model_register_dir, self.cache_key)
            self.cached_model_path = None

        # Load the model from the previously cached directory
        if self.cached_model_path:
            model = serializer.load(self.cached_model_path)
            metadata = serializer.load_metadata(self.cached_model_path)
            metadata["metadata"]["user_defined"] = self.machine.metadata.user_defined
            metadata["runtime"] = self.machine.runtime
            machine = Machine(**metadata)

        # Otherwise build and cache the model
        else:
            model, machine = self._build()
            self.cached_model_path = self._save_model(
                model=model, machine=machine, output_dir=output_dir  # type: ignore
            )
            logger.info(f"Built model, and deposited at {self.cached_model_path}")
            logger.info("Writing model-location to model registry")
            disk_registry.write_key(  # type: ignore
                model_register_dir, self.cache_key, self.cached_model_path
            )

    # Save the model to disk, unless we're building for cross-validation
    # purposes only.
    if output_dir and (self.machine.evaluation.get("cv_mode") != "cross_val_only"):
        self.cached_model_path = self._save_model(
            model=model, machine=machine, output_dir=output_dir
        )

    return model, machine
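# A hedged usage sketch of ``build`` with caching enabled. The import path
# and the ``ModelBuilder`` class name are assumptions, as are the paths.
from gordo.builder import ModelBuilder  # assumed import path


def build_with_cache(machine):
    builder = ModelBuilder(machine)
    # First call builds the model and registers its location under cache_key.
    model, machine_out = builder.build(
        output_dir="./model-output",
        model_register_dir="./model-register",
    )
    # A second call with the same cache key loads from the cached directory
    # instead of rebuilding; replace_cache=True would force a rebuild.
    model_again, _ = builder.build(model_register_dir="./model-register")
    return model, model_again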