def test_from_into(self):
    """
    Build a pipeline from a YAML definition for every registered model
    kind, then turn the resulting pipeline back into a definition.

    The test has no explicit assertions; it passes when neither conversion
    raises.
    """
    self.factories = register_model_builder.factories
    for model, kinds in self.factories.items():
        for model_kind in kinds:
            # Only the final step varies: it is filled in with the current
            # model class name and model kind.
            yaml_text = f"""
                 sklearn.pipeline.Pipeline:
                     steps:
                         - sklearn.decomposition.pca.PCA:
                             n_components: 2
                             copy: true
                             whiten: false
                             svd_solver: auto
                             tol: 0.0
                             iterated_power: auto
                             random_state:
                         - sklearn.preprocessing._function_transformer.FunctionTransformer:
                             func: gordo_components.model.transformer_funcs.general.multiply_by
                             kw_args:
                                 factor: 1
                             inverse_func: gordo_components.model.transformer_funcs.general.multiply_by
                             inv_kw_args:
                                 factor: 1
                         - sklearn.pipeline.FeatureUnion:
                             transformer_list:
                             - sklearn.decomposition.pca.PCA:
                                 n_components: 3
                                 copy: true
                                 whiten: false
                                 svd_solver: auto
                                 tol: 0.0
                                 iterated_power: auto
                                 random_state:
                             - sklearn.pipeline.Pipeline:
                                 steps:
                                 - sklearn.preprocessing.data.MinMaxScaler:
                                     feature_range:
                                     - 0
                                     - 1
                                     copy: true
                                 - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                     n_components: 2
                                     algorithm: randomized
                                     n_iter: 5
                                     random_state:
                                     tol: 0.0
                                 memory:
                             n_jobs: 1
                             transformer_weights:
                         - gordo_components.model.models.{model}:
                             kind: {model_kind}
                     memory:
                 """
            parsed = ruamel.yaml.load(yaml_text, Loader=ruamel.yaml.Loader)
            # Definition -> pipeline -> definition.
            pipeline_into_definition(pipeline_from_definition(parsed))
예제 #2
0
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer.

    Parses the YAML ``config``, builds a model from the resulting
    definition, converts the model back into a definition and finally
    round-trips it through ``serializer.dumps`` / ``serializer.loads``.
    There are no explicit assertions; the test passes when none of these
    steps raise.
    """
    # ``yaml.load`` without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError from 6.0 on; the safe loader is sufficient for a
    # plain model definition.
    config = yaml.safe_load(config)

    model = serializer.pipeline_from_definition(config)
    serializer.pipeline_into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
예제 #3
0
def test_imputer_from_definition(config_str: str):
    """
    Check that an InfImputer definition survives the gordo serializer.

    The object built from ``config_str`` must either be an ``InfImputer``
    itself or a ``Pipeline`` whose last step is one. It is then converted
    into a definition and rebuilt from that definition.
    """
    built = serializer.pipeline_from_definition(yaml.safe_load(config_str))

    # For a Pipeline the imputer is expected as the estimator of the final
    # (name, estimator) step; otherwise the built object itself is checked.
    candidate = built.steps[-1][1] if isinstance(built, Pipeline) else built
    assert isinstance(candidate, InfImputer)

    round_tripped_definition = serializer.pipeline_into_definition(built)
    serializer.pipeline_from_definition(round_tripped_definition)
    def test_into_from(self):
        """
        Turn a hand-built Pipeline into a definition and rebuild a pipeline
        from that definition, once for every registered model kind.

        The test has no explicit assertions; it passes when neither
        conversion raises.
        """
        from gordo_components.model.transformer_funcs.general import multiply_by

        self.factories = register_model_builder.factories
        for model, kinds in self.factories.items():
            for model_kind in kinds:
                # Steps are created in the order they appear in the pipeline.
                reducer = PCA(n_components=2)
                union = FeatureUnion(
                    [
                        ("step_0", PCA(n_components=3)),
                        (
                            "step_1",
                            Pipeline(
                                steps=[
                                    ("step_0", MinMaxScaler((0, 1))),
                                    ("step_1", TruncatedSVD(n_components=2)),
                                ]
                            ),
                        ),
                    ]
                )
                multiplier = FunctionTransformer(
                    func=multiply_by, kw_args={"factor": 1}
                )
                # Resolve the model class from its dotted path, then
                # instantiate it with the current kind.
                estimator_cls = pydoc.locate(f"gordo_components.model.models.{model}")
                estimator = estimator_cls(kind=model_kind)

                pipe = Pipeline(
                    [
                        ("step_0", reducer),
                        ("step_1", union),
                        ("step_2", multiplier),
                        ("step_3", estimator),
                    ]
                )

                definition = pipeline_into_definition(pipe)
                pipeline_from_definition(definition)
예제 #5
0
def test_captures_kwarg_to_init():
    """
    Extra keyword arguments given to a model must show up in its definition.

    Our models accept arbitrary kwargs which are forwarded to the underlying
    keras model (or used to construct it). ``pipeline_into_definition`` has
    to capture such kwargs even though they are not part of the ``__init__``
    signature, and the captured parameters must be usable to construct the
    model again.
    """
    autoencoder = KerasAutoEncoder(
        kind="feedforward_hourglass", some_fancy_param="Howdy!"
    )

    # The definition is keyed by the fully qualified class path.
    class_path = f"{KerasAutoEncoder.__module__}.{KerasAutoEncoder.__name__}"
    captured = pipeline_into_definition(autoencoder)[class_path]

    assert "some_fancy_param" in captured
    assert captured["some_fancy_param"] == "Howdy!"

    # The captured parameters must be enough to initialise the model again.
    KerasAutoEncoder(**captured)
예제 #6
0
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # NOTE(review): the "\b" marker in the docstring looks like click's
    # "do not re-wrap" marker, i.e. the docstring is presumably rendered as
    # CLI help text - confirm before reformatting it.

    def _provide(**extra):
        # Single place for the (possibly repeated) call that supplies the
        # saved model; `extra` carries optional flags such as replace_cache.
        return provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            **extra,
        )

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_start = data_config.pop("train_start_date")
    data_config["from_ts"] = dateutil.parser.isoparse(train_start)

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_end = data_config.pop("train_end_date")
    data_config["to_ts"] = dateutil.parser.isoparse(train_end)

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    # Normalize the tags, passing along the asset if the config has one.
    data_config["tag_list"] = normalize_sensor_tags(
        data_config["tag_list"], data_config.get("asset")
    )

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject the model parameters into the config, then parse the YAML.
    expanded_config = expand_model(model_config, dict(model_parameter))
    model_config = yaml.full_load(expanded_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    pipeline = pipeline_from_definition(model_config)
    model_config = pipeline_into_definition(pipeline)

    model_location = _provide()

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        all_scores = get_all_score_strings(load_metadata(model_location))
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = _provide(replace_cache=True)
            all_scores = get_all_score_strings(load_metadata(model_location))

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
    def test_pipeline_into_definition(self):
        """
        Every pipeline in ``self.variations_of_same_pipeline`` must dump to
        the same YAML definition.

        Both the expected text and the dumped text have surrounding
        whitespace stripped and all spaces removed before comparison, so
        indentation differences do not matter.
        """
        raw_expected = """
            sklearn.pipeline.Pipeline:
                steps:
                    - sklearn.decomposition.pca.PCA:
                        n_components: 2
                        copy: true
                        whiten: false
                        svd_solver: auto
                        tol: 0.0
                        iterated_power: auto
                        random_state:
                    - sklearn.pipeline.FeatureUnion:
                        transformer_list:
                        - sklearn.decomposition.pca.PCA:
                            n_components: 3
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.pipeline.Pipeline:
                            steps:
                            - sklearn.preprocessing.data.MinMaxScaler:
                                feature_range:
                                - 0
                                - 1
                                copy: true
                            - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                n_components: 2
                                algorithm: randomized
                                n_iter: 5
                                random_state:
                                tol: 0.0
                            memory:
                        n_jobs:
                        transformer_weights:
                    - gordo_components.model.models.KerasAutoEncoder:
                        kind: feedforward_hourglass
                memory:
            """
        expected_definition = raw_expected.strip().replace(" ", "")

        for variant in self.variations_of_same_pipeline:
            # Using ruamel over PyYaml, better output option support
            buffer = io.StringIO()
            ruamel.yaml.dump(
                pipeline_into_definition(variant),
                buffer,
                Dumper=ruamel.yaml.RoundTripDumper,
            )

            current_output = buffer.getvalue().strip().replace(" ", "")
            self.assertEqual(
                current_output,
                expected_definition,
                msg=f"Failed output:\n{current_output}\nExpected:----------------\n{expected_definition}",
            )
예제 #8
0
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in  JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.

    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}
    """
    # NOTE(review): the "\b" marker in the docstring looks like click's
    # "do not re-wrap" marker, i.e. the docstring is presumably rendered as
    # CLI help text - confirm before reformatting it.

    # Rename / parse the incoming keys in place. The order is kept so that a
    # missing key raises KeyError at the same point as before.
    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date"))

    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"],
                                                asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(
        pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    # The metadata of the built / cached model is kept in its own variable.
    # Rebinding the `metadata` parameter here would make the forced rebuild
    # further down pass the loaded model metadata where the user supplied
    # metadata is expected.
    if evaluation_config["cv_mode"] == "cross_val_only":

        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(name,
                                            model_config,
                                            data_config,
                                            evaluation_config,
                                            metadata=metadata)
            cache_model_location = check_cache(model_register_dir, cache_key)

        if cache_model_location:
            model_metadata = load_metadata(cache_model_location)
        else:
            _, model_metadata = build_model(name, model_config, data_config,
                                            metadata, evaluation_config)

    else:
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        model_metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        all_scores = get_all_score_strings(model_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model")

            # `metadata` is still the caller supplied value here, so the
            # rebuild uses the same inputs as a regular build.
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            model_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(model_metadata)

        for score in all_scores:
            print(score)

    return 0