def load_model_and_metadata(model_dir_env_var: str) -> typing.Tuple[BaseEstimator, dict]:
    """
    Load a serialized model and its metadata from the directory named by an
    environment variable.

    Parameters
    ----------
    model_dir_env_var: str
        The name of the environment variable which stores the location of the model

    Returns
    -------
    BaseEstimator, dict
        Tuple where the 0th element is the model, and the 1st element is the
        metadata associated with the model

    Raises
    ------
    ValueError
        If the environment variable is not set.
    NotADirectoryError
        If the environment variable points to a non-existent directory.
    """
    logger.debug("Determining model location...")

    # Resolve where the model lives; fail loudly on a missing or bad location.
    directory = os.getenv(model_dir_env_var)
    if directory is None:
        raise ValueError(f'Environment variable "{model_dir_env_var}" not set!')
    if not os.path.isdir(directory):
        raise NotADirectoryError(
            f'The supplied directory: "{directory}" does not exist!'
        )

    return serializer.load(directory), serializer.load_metadata(directory)
def load_metadata(directory: str, name: str) -> dict:
    """
    Load the metadata stored for a named model under ``directory``.

    Parameters
    ----------
    directory: str
        Directory to look for the model's metadata
    name: str
        Name of the model to load metadata for; this is the sub directory
        within ``directory``.

    Returns
    -------
    dict
    """
    # The model's files live in <directory>/<name>; delegate the actual read.
    return serializer.load_metadata(os.path.join(directory, name))
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
        NOTE(review): the code below calls ``model_location_file.write(...)``
        directly, so this looks like an already-open writable file object rather
        than a path — confirm against the CLI option declaration.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # --- Translate CLI/ENV-shaped keys in data_config into dataset kwargs ---
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )
    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    # Normalize raw tag names (strings/dicts) into the project's tag objects;
    # `asset` is an optional hint used during normalization.
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject `model_parameter` key/values into the (jinja-templated) model
    # config string, then parse the resulting YAML into a dict.
    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    # Build (or fetch from the register cache) and persist the model; returns
    # the directory the serialized model was written to.
    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        # NOTE(review): this `load_metadata` is called with a single path
        # argument — presumably a module-local loader, not the two-argument
        # serializer-level load_metadata(directory, name); confirm the import.
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)
        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
    def test_pipeline_serialization(self):
        """
        Round-trip a nested sklearn Pipeline (with a FeatureUnion and a Keras
        model step) through serializer.dump/load and dumps/loads, verifying the
        on-disk directory layout, metadata persistence, and identical
        predictions before and after each round trip.
        """
        # Nested pipeline: PCA -> FeatureUnion(PCA, MinMax->TruncatedSVD) -> Keras AE.
        pipe = Pipeline(
            [
                ("pca1", PCA(n_components=10)),
                (
                    "fu",
                    FeatureUnion(
                        [
                            ("pca2", PCA(n_components=3)),
                            (
                                "pipe",
                                Pipeline(
                                    [
                                        ("minmax", MinMaxScaler()),
                                        ("truncsvd", TruncatedSVD(n_components=7)),
                                    ]
                                ),
                            ),
                        ]
                    ),
                ),
                ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
            ]
        )

        # 10x10 input; autoencoder target is the input itself.
        X = np.random.random(size=100).reshape(10, 10)
        pipe.fit(X.copy(), X.copy())

        with TemporaryDirectory() as tmp:

            # Test dump
            metadata = {"key": "value"}
            serializer.dump(pipe, tmp, metadata=metadata)

            # Assert that a dirs are created for each step in Pipeline.
            # NOTE(review): module paths like sklearn.decomposition.pca are
            # version-specific (pre-1.0 sklearn layout) — confirm pinned version.
            expected_structure = OrderedDict(
                [
                    ("n_step=000-class=sklearn.pipeline.Pipeline", "metadata.json"),
                    (
                        "n_step=000-class=sklearn.pipeline.Pipeline",
                        OrderedDict(
                            [
                                (
                                    "n_step=000-class=sklearn.decomposition.pca.PCA",
                                    "pca1.pkl.gz",
                                ),
                                (
                                    "n_step=001-class=sklearn.pipeline.FeatureUnion",
                                    "params.json",
                                ),
                                (
                                    "n_step=001-class=sklearn.pipeline.FeatureUnion",
                                    OrderedDict(
                                        [
                                            (
                                                "n_step=000-class=sklearn.decomposition.pca.PCA",
                                                "pca2.pkl.gz",
                                            ),
                                            (
                                                "n_step=001-class=sklearn.pipeline.Pipeline",
                                                OrderedDict(
                                                    [
                                                        (
                                                            "n_step=000-class=sklearn.preprocessing.data.MinMaxScaler",
                                                            "minmax.pkl.gz",
                                                        ),
                                                        (
                                                            "n_step=001-class=sklearn.decomposition.truncated_svd.TruncatedSVD",
                                                            "truncsvd.pkl.gz",
                                                        ),
                                                    ]
                                                ),
                                            ),
                                        ]
                                    ),
                                ),
                                (
                                    "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                                    "model.h5",
                                ),
                                (
                                    "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                                    "params.json",
                                ),
                            ]
                        ),
                    ),
                ]
            )

            self._structure_verifier(prefix_dir=tmp, structure=expected_structure)

            # Test load from the serialized pipeline above
            pipe_clone = serializer.load(tmp)
            metadata_clone = serializer.load_metadata(tmp)

            # Ensure the metadata was saved and loaded back
            self.assertEqual(metadata, metadata_clone)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))

            # Now use dumps/loads
            serialized = serializer.dumps(pipe)
            pipe_clone = serializer.loads(serialized)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in  JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value

                Example::

                    {"cv_mode": "cross_val_only"}
    """
    # --- Translate CLI-shaped keys in data_config into dataset kwargs ---
    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider

    # Normalize raw tag names into the project's tag objects; `asset` is an
    # optional hint used during normalization.
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"], asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject `model_parameter` key/values into the (jinja-templated) model
    # config string, then parse the resulting YAML into a dict.
    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    if evaluation_config["cv_mode"] == "cross_val_only":
        # Cross-validation only: no model is persisted here. Reuse cached
        # metadata if the register already has an entry for this exact config.
        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(
                name, model_config, data_config, evaluation_config, metadata=metadata
            )
            cache_model_location = check_cache(model_register_dir, cache_key)
        if cache_model_location:
            metadata = load_metadata(cache_model_location)
        else:
            _, metadata = build_model(
                name, model_config, data_config, metadata, evaluation_config
            )
    else:
        # Full/standard path: build (or fetch cached) and persist the model.
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    # NOTE(review): in cv_mode="cross_val_only" this fallback still performs a
    # full provide_saved_model build — confirm that is intended.
    if print_cv_scores:
        retrieved_metadata = metadata
        all_scores = get_all_score_strings(retrieved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)
        for score in all_scores:
            print(score)

    return 0