def test_from_into(self):
    """
    Round-trip check: definition -> pipeline -> definition.

    For every registered model and every kind registered under it, a YAML
    definition is rendered, parsed, turned into a pipeline and that pipeline
    is converted back into a definition. The test passes when no step raises.
    """
    self.factories = register_model_builder.factories
    for model, kinds in self.factories.items():
        for model_kind in kinds:
            # Only the parsed structure of this YAML matters, not its
            # absolute indentation.
            raw_definition = f"""
                sklearn.pipeline.Pipeline:
                    steps:
                        - sklearn.decomposition.pca.PCA:
                            n_components: 2
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.preprocessing._function_transformer.FunctionTransformer:
                            func: gordo_components.model.transformer_funcs.general.multiply_by
                            kw_args:
                                factor: 1
                            inverse_func: gordo_components.model.transformer_funcs.general.multiply_by
                            inv_kw_args:
                                factor: 1
                        - sklearn.pipeline.FeatureUnion:
                            transformer_list:
                            - sklearn.decomposition.pca.PCA:
                                n_components: 3
                                copy: true
                                whiten: false
                                svd_solver: auto
                                tol: 0.0
                                iterated_power: auto
                                random_state:
                            - sklearn.pipeline.Pipeline:
                                steps:
                                - sklearn.preprocessing.data.MinMaxScaler:
                                    feature_range:
                                    - 0
                                    - 1
                                    copy: true
                                - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                    n_components: 2
                                    algorithm: randomized
                                    n_iter: 5
                                    random_state:
                                    tol: 0.0
                                memory:
                            n_jobs: 1
                            transformer_weights:
                        - gordo_components.model.models.{model}:
                            kind: {model_kind}
                    memory:
                """
            parsed_definition = ruamel.yaml.load(
                raw_definition, Loader=ruamel.yaml.Loader
            )
            pipeline_into_definition(pipeline_from_definition(parsed_definition))
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer.

    Parses the YAML ``config`` string, builds the model from the resulting
    definition, converts it back into a definition and finally checks that the
    model survives a ``dumps``/``loads`` cycle. The test passes when none of
    these steps raises.
    """
    # ``yaml.load`` without an explicit Loader is deprecated (PyYAML >= 5.1)
    # and a TypeError in PyYAML 6; the config is plain YAML so the safe
    # loader is sufficient.
    definition = yaml.safe_load(config)
    model = serializer.pipeline_from_definition(definition)
    serializer.pipeline_into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
def test_imputer_from_definition(config_str: str):
    """
    Ensure InfImputer plays well with the gordo serializer.

    The parsed definition must yield either an ``InfImputer`` directly or a
    ``Pipeline`` whose final step is one, and the resulting model must survive
    an into-definition / from-definition round trip.
    """
    definition = yaml.safe_load(config_str)
    model = serializer.pipeline_from_definition(definition)

    # For a pipeline the imputer is expected as the estimator of the last step.
    candidate = model.steps[-1][1] if isinstance(model, Pipeline) else model
    assert isinstance(candidate, InfImputer)

    round_tripped = serializer.pipeline_into_definition(model)
    serializer.pipeline_from_definition(round_tripped)
def test_into_from(self):
    """
    Round-trip check: pipeline -> definition -> pipeline.

    For every registered model and kind, a nested sklearn pipeline ending in
    that model is converted into a definition and rebuilt from it. The test
    passes when no step raises.
    """
    from gordo_components.model.transformer_funcs.general import multiply_by

    self.factories = register_model_builder.factories
    for model, kinds in self.factories.items():
        for model_kind in kinds:
            # Innermost pipeline, nested inside the feature union.
            scaled_svd = Pipeline(
                steps=[
                    ("step_0", MinMaxScaler((0, 1))),
                    ("step_1", TruncatedSVD(n_components=2)),
                ]
            )
            union = FeatureUnion(
                [("step_0", PCA(n_components=3)), ("step_1", scaled_svd)]
            )
            transformer = FunctionTransformer(
                func=multiply_by, kw_args={"factor": 1}
            )
            # Resolve the model class by its dotted path and instantiate it.
            model_cls = pydoc.locate(f"gordo_components.model.models.{model}")
            estimator = model_cls(kind=model_kind)

            pipe = Pipeline(
                [
                    ("step_0", PCA(n_components=2)),
                    ("step_1", union),
                    ("step_2", transformer),
                    ("step_3", estimator),
                ]
            )
            definition = pipeline_into_definition(pipe)
            pipeline_from_definition(definition)
def test_captures_kwarg_to_init():
    """
    Our models allow kwargs which are put into the underlying keras model or
    used to construct the underlying model. We want to ensure that
    into-definition captures kwargs which are part of the model parameters but
    not part of the __init__ signature.
    """
    model = KerasAutoEncoder(kind="feedforward_hourglass", some_fancy_param="Howdy!")

    # The definition is keyed by the fully qualified class name.
    qualified_name = ".".join(
        (KerasAutoEncoder.__module__, KerasAutoEncoder.__name__)
    )
    params = pipeline_into_definition(model)[qualified_name]

    assert "some_fancy_param" in params
    assert params["some_fancy_param"] == "Howdy!"

    # And make sure we can init again
    KerasAutoEncoder(**params)
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # NOTE: the docstring wording is left untouched; the ``\b`` marker suggests
    # it is rendered as command help text.

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_start = data_config.pop("train_start_date")
    data_config["from_ts"] = dateutil.parser.isoparse(train_start)

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_end = data_config.pop("train_end_date")
    data_config["to_ts"] = dateutil.parser.isoparse(train_end)

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    # Replace the raw tags with their normalized form.
    data_config["tag_list"] = normalize_sensor_tags(
        data_config["tag_list"], data_config.get("asset", None)
    )

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject the key-values into the config template, then parse the YAML.
    rendered_config = expand_model(model_config, dict(model_parameter))
    parsed_config = yaml.full_load(rendered_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(parsed_config))

    # Positional arguments shared by both `provide_saved_model` calls below.
    build_args = (
        name,
        model_config,
        data_config,
        metadata,
        output_dir,
        model_register_dir,
    )
    model_location = provide_saved_model(*build_args)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        score_lines = get_all_score_strings(load_metadata(model_location))
        if not score_lines:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(*build_args, replace_cache=True)
            score_lines = get_all_score_strings(load_metadata(model_location))

        for score_line in score_lines:
            print(score_line)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
def test_pipeline_into_definition(self):
    """
    Every pipeline in ``self.variations_of_same_pipeline`` must serialize to
    the same YAML definition. The comparison ignores spaces but is sensitive
    to line breaks.
    """
    expected_definition = (
        """
        sklearn.pipeline.Pipeline:
            steps:
            - sklearn.decomposition.pca.PCA:
                n_components: 2
                copy: true
                whiten: false
                svd_solver: auto
                tol: 0.0
                iterated_power: auto
                random_state:
            - sklearn.pipeline.FeatureUnion:
                transformer_list:
                - sklearn.decomposition.pca.PCA:
                    n_components: 3
                    copy: true
                    whiten: false
                    svd_solver: auto
                    tol: 0.0
                    iterated_power: auto
                    random_state:
                - sklearn.pipeline.Pipeline:
                    steps:
                    - sklearn.preprocessing.data.MinMaxScaler:
                        feature_range:
                        - 0
                        - 1
                        copy: true
                    - sklearn.decomposition.truncated_svd.TruncatedSVD:
                        n_components: 2
                        algorithm: randomized
                        n_iter: 5
                        random_state:
                        tol: 0.0
                    memory:
                n_jobs:
                transformer_weights:
            - gordo_components.model.models.KerasAutoEncoder:
                kind: feedforward_hourglass
            memory:
        """.strip().replace(" ", "")
    )

    for pipe in self.variations_of_same_pipeline:
        definition = pipeline_into_definition(pipe)

        # Using ruamel over PyYaml, better output option support
        buffer = io.StringIO()
        ruamel.yaml.dump(definition, buffer, Dumper=ruamel.yaml.RoundTripDumper)
        actual = buffer.getvalue().strip().replace(" ", "")

        self.assertEqual(
            actual,
            expected_definition,
            msg=f"Failed output:\n{actual}\nExpected:----------------\n{expected_definition}",
        )
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in intializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in  JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}
    """
    # NOTE: ``data_config`` is modified in place: the raw keys are popped and
    # replaced by the names used further down.
    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"], asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    # The metadata loaded from / produced by the model is kept in its own
    # variable. The ``metadata`` argument must stay untouched because it is
    # passed on again if a rebuild is forced further down.
    if evaluation_config["cv_mode"] == "cross_val_only":

        # Look for an already registered model before building anything.
        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(
                name, model_config, data_config, evaluation_config, metadata=metadata
            )
            cache_model_location = check_cache(model_register_dir, cache_key)

        if cache_model_location:
            model_metadata = load_metadata(cache_model_location)
        else:
            _, model_metadata = build_model(
                name, model_config, data_config, metadata, evaluation_config
            )

    else:
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        model_metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        all_scores = get_all_score_strings(model_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )

            # Rebuild with the caller's metadata, not the metadata read back
            # from the previous model.
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            all_scores = get_all_score_strings(load_metadata(model_location))

        for score in all_scores:
            print(score)

    # Presumably the exit status of the command; confirm against the caller.
    return 0