def test_client_cli_download_model(
    gordo_project, gordo_single_target, ml_server, tmpdir
):
    """
    Test proper execution of the client download-model sub-command
    """
    runner = CliRunner()

    # Empty output directory before downloading
    assert len(os.listdir(tmpdir)) == 0

    out = runner.invoke(
        gordo_client,
        args=[
            "--project",
            gordo_project,
            "download-model",
            str(tmpdir),
            "--target",
            gordo_single_target,
        ],
    )
    assert (
        out.exit_code == 0
    ), f"Expected output code 0 got '{out.exit_code}', {out.output}"

    # Output directory should not be empty any longer
    assert len(os.listdir(tmpdir)) > 0

    model_output_dir = os.path.join(tmpdir, gordo_single_target)
    assert os.path.isdir(model_output_dir)

    model = serializer.load(model_output_dir)
    assert isinstance(model, BaseEstimator)
def test_pipeline_serialization(self):
    pipe = Pipeline(
        [
            ("pca1", PCA(n_components=10)),
            (
                "fu",
                FeatureUnion(
                    [
                        ("pca2", PCA(n_components=3)),
                        (
                            "pipe",
                            Pipeline(
                                [
                                    ("minmax", MinMaxScaler()),
                                    ("truncsvd", TruncatedSVD(n_components=7)),
                                ]
                            ),
                        ),
                    ]
                ),
            ),
            ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
        ]
    )

    X = np.random.random(size=100).reshape(10, 10)
    pipe.fit(X.copy(), X.copy())

    with TemporaryDirectory() as tmp:

        # Test dump
        metadata = {"key": "value"}
        serializer.dump(pipe, tmp, metadata=metadata)

        # Test load from the serialized pipeline above
        pipe_clone = serializer.load(tmp)
        metadata_clone = serializer.load_metadata(tmp)

        # Ensure the metadata was saved and loaded back
        self.assertEqual(metadata, metadata_clone)

        # Verify same state for both pipelines
        y_hat_pipe1 = pipe.predict(X.copy()).flatten()
        y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
        self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))

        # Now use dumps/loads
        serialized = serializer.dumps(pipe)
        pipe_clone = serializer.loads(serialized)

        # Verify same state for both pipelines
        y_hat_pipe1 = pipe.predict(X.copy()).flatten()
        y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
        self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))
def test_dump_load_models(model):
    X = np.random.random(size=100).reshape(10, 10)
    model.fit(X.copy(), X.copy())
    model_out = model.predict(X.copy())

    with TemporaryDirectory() as tmp:
        serializer.dump(model, tmp)

        model_clone = serializer.load(tmp)
        model_clone_out = model_clone.predict(X.copy())

        assert np.allclose(model_out.flatten(), model_clone_out.flatten())
def load_model(directory: str, name: str) -> BaseEstimator:
    """
    Load a given model from the directory by name.

    Parameters
    ----------
    directory: str
        Directory to look for the model in.
    name: str
        Name of the model to load; this is the sub-directory
        within the ``directory`` parameter.

    Returns
    -------
    BaseEstimator
    """
    start_time = timeit.default_timer()
    model = serializer.load(os.path.join(directory, name))
    logger.debug(f"Time to load model: {timeit.default_timer() - start_time}s")
    return model
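# A minimal usage sketch for ``load_model`` (the paths and model name below
# are hypothetical; it assumes a model was previously serialized into
# <directory>/<name> with ``serializer.dump``):
#
#     model = load_model(directory="/gordo/models", name="my-machine")
#     y_hat = model.predict(X)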
def build(
    self,
    output_dir: Optional[Union[os.PathLike, str]] = None,
    model_register_dir: Optional[Union[os.PathLike, str]] = None,
    replace_cache=False,
) -> Tuple[sklearn.base.BaseEstimator, Machine]:
    """
    Always return a model and its metadata.

    If ``output_dir`` is supplied, the model will be saved there.
    ``model_register_dir`` points to the model cache directory from which
    the model will be read if a cached entry exists. Supplying both has the
    effect of both: the model is read from the cache and that cached model
    is saved to the new output directory.

    Parameters
    ----------
    output_dir: Optional[Union[os.PathLike, str]]
        A path to where the model will be deposited.
    model_register_dir: Optional[Union[os.PathLike, str]]
        A path to a register, see :func:`gordo.util.disk_registry`.
        If this is None then always build the model, otherwise try to
        resolve the model from the registry.
    replace_cache: bool
        Forces a rebuild of the model, and replaces the entry in the
        cache with the new model.

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, Machine]
        Built model and an updated ``Machine``
    """
    if not model_register_dir:
        model, machine = self._build()
    else:
        logger.debug(
            f"Model caching activated, attempting to read model-location with key "
            f"{self.cache_key} from register {model_register_dir}"
        )
        self.cached_model_path = self.check_cache(model_register_dir)

        if replace_cache:
            logger.info("replace_cache=True, deleting any existing cache entry")
            disk_registry.delete_value(model_register_dir, self.cache_key)
            self.cached_model_path = None

        # Load the model from the previously cached directory
        if self.cached_model_path:
            model = serializer.load(self.cached_model_path)
            metadata = serializer.load_metadata(self.cached_model_path)
            metadata["metadata"]["user_defined"] = self.machine.metadata.user_defined
            metadata["runtime"] = self.machine.runtime
            machine = Machine(**metadata)

        # Otherwise build and cache the model
        else:
            model, machine = self._build()
            self.cached_model_path = self._save_model(
                model=model, machine=machine, output_dir=output_dir  # type: ignore
            )
            logger.info(f"Built model, and deposited at {self.cached_model_path}")
            logger.info("Writing model-location to model registry")
            disk_registry.write_key(  # type: ignore
                model_register_dir, self.cache_key, self.cached_model_path
            )

    # Save model to disk, if we're not building for cv-only purposes.
    if output_dir and (self.machine.evaluation.get("cv_mode") != "cross_val_only"):
        self.cached_model_path = self._save_model(
            model=model, machine=machine, output_dir=output_dir
        )

    return model, machine
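# A minimal usage sketch of the caching behaviour described in ``build``
# (hypothetical paths; construction of the builder and its ``Machine`` is
# elided, and ``builder`` is an assumed instance of the class above):
#
#     # First call: no cache entry exists for this cache_key, so the model is
#     # built, deposited, and its location written to the register.
#     model, machine = builder.build(
#         output_dir="/gordo/models/out", model_register_dir="/gordo/registry"
#     )
#
#     # Second call with the same cache_key: the model is resolved from the
#     # registry and loaded instead of rebuilt, then saved to the new
#     # output_dir (unless evaluation is configured as "cross_val_only").
#     model, machine = builder.build(
#         output_dir="/gordo/models/out2", model_register_dir="/gordo/registry"
#     )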