def test_provide_saved_model_caching_handle_existing_same_dir(tmp_dir):
    """If the model exists in the model register, and the path there is the
    same as output_dir, output_dir is returned.
    """
    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmp_dir.name, "model")
    registry_dir = os.path.join(tmp_dir.name, "registry")

    def _build():
        # Identical parameters on every call -> identical registry cache key.
        return provide_saved_model(
            name="model-name",
            model_config=model_config,
            data_config=data_config,
            metadata={},
            output_dir=output_dir,
            model_register_dir=registry_dir,
        )

    # First build lands in output_dir and registers that location.
    assert _build() == output_dir
    # Saving to the same output_dir as the one stored in the registry is a
    # cache hit and simply returns output_dir again.
    assert _build() == output_dir
def test_provide_saved_model_caching(
    should_be_equal: bool,
    metadata: Optional[Dict],
    tag_list: Optional[List[SensorTag]],
):
    """
    Exercise provide_saved_model's caching, including cache busting when
    metadata or tag_list differs between the two builds.

    Parameters
    ----------
    should_be_equal : bool
        Should the two generated models be at the same location or not?
    metadata
        Optional metadata which will be used as metadata instead of the default
    tag_list
        Possible list of strings which be used as the taglist in the dataset if
        provided
    """
    tag_list = [] if tag_list is None else tag_list
    metadata = dict() if metadata is None else metadata

    with TemporaryDirectory() as tmpdir:
        model_config = {
            "gordo_components.model.models.KerasAutoEncoder": {
                "kind": "feedforward_hourglass"
            }
        }
        data_config = get_random_data()
        registry_dir = os.path.join(tmpdir, "registry")

        first_location = provide_saved_model(
            model_config=model_config,
            data_config=data_config,
            output_dir=os.path.join(tmpdir, "model"),
            metadata={},
            model_register_dir=registry_dir,
        )

        # Optionally alter the dataset config before the second build to
        # provoke (or not provoke) a cache miss.
        if tag_list:
            data_config["tag_list"] = tag_list

        second_location = provide_saved_model(
            model_config=model_config,
            data_config=data_config,
            output_dir=os.path.join(tmpdir, "model2"),
            metadata=metadata,
            model_register_dir=registry_dir,
        )

        if should_be_equal:
            assert first_location == second_location
        else:
            assert first_location != second_location
def test_provide_saved_model_simple_happy_path(self):
    """
    Test provide_saved_model with no caching
    """
    with TemporaryDirectory() as tmpdir:
        model_config = {
            "gordo_components.model.models.KerasAutoEncoder": {
                "kind": "feedforward_hourglass"
            }
        }
        data_config = get_random_data()

        model_location = provide_saved_model(
            model_config=model_config,
            data_config=data_config,
            metadata={},
            output_dir=os.path.join(tmpdir, "model"),
        )

        # Serializing via gordo_components.serializer is expected to create
        # at least one subdirectory whose name starts with 'n_step'.
        dirs = [
            entry
            for entry in os.listdir(model_location)
            if entry.startswith("n_step")
        ]
        self.assertGreaterEqual(
            len(dirs),
            1,
            msg="Expected saving of model to create at "
            f"least one subdir, but got {len(dirs)}",
        )
def build(output_dir, model_config, data_config, metadata, model_register_dir):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    output_dir: str
        Directory to save model & metadata to.
    model_config: dict
        kwargs to be used in initializing the model. Should also
        contain kwarg 'type' which references the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )
    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date")
    )

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider()

    # Normalize the raw tag strings into SensorTag objects for the dataset.
    asset = data_config.get("asset", None)
    data_config["tag_list"] = normalize_sensor_tags(data_config["tag_list"], asset)

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_location = provide_saved_model(
        model_config, data_config, metadata, output_dir, model_register_dir
    )

    # Record where the serialized model ended up so downstream steps can find it.
    with open("/tmp/model-location.txt", "w") as f:
        f.write(model_location)
    return 0
def test_provide_saved_model_caching_handle_existing_different_register(tmp_dir):
    """If the model exists in the model register, but the output_dir is not
    where the model is, the model is copied to the new location, unless the new
    location already exists. If it does then return it.
    """
    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    registry_dir = os.path.join(tmp_dir.name, "registry")

    def _build(destination):
        # Same name/config each time -> same registry entry; only the
        # requested output directory varies.
        return provide_saved_model(
            name="model-name",
            model_config=model_config,
            data_config=data_config,
            metadata={},
            output_dir=destination,
            model_register_dir=registry_dir,
        )

    first_dir = os.path.join(tmp_dir.name, "model1")
    second_dir = os.path.join(tmp_dir.name, "model2")

    # Initial build registers the model at first_dir.
    _build(first_dir)
    # Registered model + different output_dir -> copied to second_dir.
    assert _build(second_dir) == second_dir
    # second_dir already exists now, so it is returned untouched.
    assert _build(second_dir) == second_dir
def test_provide_saved_model_simple_happy_path(tmp_dir):
    """
    Test provide_saved_model with no caching
    """
    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmp_dir.name, "model")
    model_location = provide_saved_model(
        name="model-name",
        model_config=model_config,
        data_config=data_config,
        metadata={},
        output_dir=output_dir,
    )
    # Assert the model was saved at the location
    # using gordo_components.serializer should create some subdir(s)
    # which start with 'n_step'
    dirs = [d for d in os.listdir(model_location) if d.startswith("n_step")]
    # FIX: the failure message was a plain string missing the `f` prefix, so
    # "{len(dirs)}" was printed literally instead of the actual count.
    assert (
        len(dirs) >= 1
    ), f"Expected saving of model to create at least one subdir, but got {len(dirs)}"
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")
    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )
    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))
    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)
    # Resolve raw tag strings into normalized sensor tags; 'asset' (if given)
    # scopes the normalization.
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list
    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")
    # Inject CLI-supplied key/values into jinja variables in the raw yaml
    # config, then parse it into a dict.
    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    # Builds (or retrieves from the register cache) the serialized model.
    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )
    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            # replace_cache=True evicts the stale register entry and rebuilds.
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)
        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
def test_provide_saved_model_caching(
    should_be_equal: bool,
    metadata: Optional[Dict],
    tag_list: Optional[List[SensorTag]],
    replace_cache,
):
    """
    Exercise provide_saved_model's caching, and the cache busting triggered by
    changed metadata, a changed tag_list, or an explicit replace_cache.

    Two models are built; caching happened if and only if their
    model-creation-date metadata entries are identical.

    Parameters
    ----------
    should_be_equal : bool
        Do we expect the two generated models to be at the same location or
        not? I.e. do we expect caching.
    metadata
        Optional metadata which will be used as metadata for the second model.
    tag_list
        Optional list of strings which be used as the taglist in the dataset
        for the second model.
    replace_cache: bool
        Should we force a model cache replacement?
    """
    tag_list = [] if tag_list is None else tag_list
    metadata = dict() if metadata is None else metadata

    with TemporaryDirectory() as tmpdir:
        model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
        data_config = get_random_data()
        registry_dir = os.path.join(tmpdir, "registry")

        first_location = provide_saved_model(
            name="model-name",
            model_config=model_config,
            data_config=data_config,
            output_dir=os.path.join(tmpdir, "model"),
            metadata={},
            model_register_dir=registry_dir,
        )

        # Optionally perturb the dataset config before the second build.
        if tag_list:
            data_config["tag_list"] = tag_list

        second_location = provide_saved_model(
            name="model-name",
            model_config=model_config,
            data_config=data_config,
            output_dir=os.path.join(tmpdir, "model2"),
            metadata=metadata,
            model_register_dir=registry_dir,
            replace_cache=replace_cache,
        )

        # Identical creation dates <=> the second build was a cache hit.
        first_created = serializer.load_metadata(str(first_location))["model"][
            "model-creation-date"
        ]
        second_created = serializer.load_metadata(str(second_location))["model"][
            "model-creation-date"
        ]
        if should_be_equal:
            assert first_created == second_created
        else:
            assert first_created != second_created
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key
                value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model,
                  default value

        Example::

            {"cv_mode": "cross_val_only"}
    """
    # Rename/parse raw CLI keys into the form the dataset expects.
    data_config["tag_list"] = data_config.pop("tags")
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )
    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date")
    )
    # Set default data provider for data config
    data_config["data_provider"] = data_provider
    # Resolve raw tag strings into normalized sensor tags; 'asset' (if given)
    # scopes the normalization.
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"], asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject CLI-supplied key/values into jinja variables in the raw yaml
    # config, then parse it into a dict.
    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(
        pipeline_from_definition(model_config)
    )
    logger.debug(f"Fully expanded model config: {model_config}")

    if evaluation_config["cv_mode"] == "cross_val_only":
        # Cross-validation-only mode: no model is serialized to output_dir.
        # Reuse cached metadata from the register when available.
        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(
                name, model_config, data_config, evaluation_config, metadata=metadata
            )
            cache_model_location = check_cache(model_register_dir, cache_key)
        if cache_model_location:
            metadata = load_metadata(cache_model_location)
        else:
            _, metadata = build_model(
                name, model_config, data_config, metadata, evaluation_config
            )
    else:
        # Full/build-only mode: build (or fetch from cache) and serialize.
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        retrieved_metadata = metadata
        all_scores = get_all_score_strings(retrieved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            # replace_cache=True evicts the stale register entry and rebuilds.
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)
        for score in all_scores:
            print(score)
    return 0