示例#1
0
def test_provide_saved_model_caching_handle_existing_same_dir(tmp_dir):
    """Requesting the same model twice with the same output_dir and the same
    model register must return output_dir both times."""
    pca_definition = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    dataset_definition = get_random_data()
    output_dir = os.path.join(tmp_dir.name, "model")
    registry_dir = os.path.join(tmp_dir.name, "registry")

    def request_model():
        # A fresh, empty metadata dict is handed over on every request.
        return provide_saved_model(
            name="model-name",
            model_config=pca_definition,
            data_config=dataset_definition,
            metadata={},
            output_dir=output_dir,
            model_register_dir=registry_dir,
        )

    # First request for this configuration.
    assert request_model() == output_dir

    # Asking again for the output_dir that is already stored in the registry
    # simply returns that output_dir.
    assert request_model() == output_dir
示例#2
0
def test_provide_saved_model_caching(should_be_equal: bool,
                                     metadata: Optional[Dict],
                                     tag_list: Optional[List[SensorTag]]):
    """
    Exercise the caching of provide_saved_model, including cache busting when
    metadata or tag_list is supplied for the second request.

    Parameters
    ----------
    should_be_equal : bool
        Whether both requests are expected to resolve to the same location.
    metadata
        Optional metadata passed to the second request; an empty dict is used
        when None.
    tag_list
        Optional tag list which, when non-empty, replaces "tag_list" in the data
        config before the second request.

    """
    tag_list = [] if tag_list is None else tag_list
    metadata = {} if metadata is None else metadata

    with TemporaryDirectory() as workspace:
        autoencoder_definition = {
            "gordo_components.model.models.KerasAutoEncoder": {
                "kind": "feedforward_hourglass"
            }
        }
        dataset_definition = get_random_data()
        registry_dir = os.path.join(workspace, "registry")

        # First request: always uses empty metadata and the untouched data config.
        first_location = provide_saved_model(
            model_config=autoencoder_definition,
            data_config=dataset_definition,
            output_dir=os.path.join(workspace, "model"),
            metadata={},
            model_register_dir=registry_dir,
        )

        # Second request: same register, different output dir, and optionally
        # different tags / metadata.
        if tag_list:
            dataset_definition["tag_list"] = tag_list
        second_location = provide_saved_model(
            model_config=autoencoder_definition,
            data_config=dataset_definition,
            output_dir=os.path.join(workspace, "model2"),
            metadata=metadata,
            model_register_dir=registry_dir,
        )

        if not should_be_equal:
            assert first_location != second_location
        else:
            assert first_location == second_location
示例#3
0
    def test_provide_saved_model_simple_happy_path(self):
        """
        Call provide_saved_model without a model register (no caching) and
        check that something was written to the returned location.
        """
        autoencoder_definition = {
            "gordo_components.model.models.KerasAutoEncoder": {
                "kind": "feedforward_hourglass"
            }
        }

        with TemporaryDirectory() as workspace:
            model_location = provide_saved_model(
                model_config=autoencoder_definition,
                data_config=get_random_data(),
                metadata={},
                output_dir=os.path.join(workspace, "model"),
            )

            # Assert the model was saved at the location
            # using gordo_components.serializer should create some subdir(s)
            # which start with 'n_step'
            step_dirs = [
                entry
                for entry in os.listdir(model_location)
                if entry.startswith("n_step")
            ]
            found = len(step_dirs)
            self.assertGreaterEqual(
                found,
                1,
                msg=f"Expected saving of model to create at least one subdir, but got {found}",
            )
示例#4
0
def build(output_dir, model_config, data_config, metadata, model_register_dir):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    output_dir: str
        Directory to save model & metadata to.
    model_config: dict
        kwargs to be used in initializing the model. Should also
        contain kwarg 'type' which references the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
        Note that this dict is modified in place: 'tags', 'train_start_date' and
        'train_end_date' are replaced by 'tag_list', 'from_ts' and 'to_ts', and
        'data_provider' is set.
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild

    Returns
    -------
    int
        Always 0.
    """

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines

    raw_tags = data_config.pop("tags")
    data_config["tag_list"] = raw_tags

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_start = data_config.pop("train_start_date")
    data_config["from_ts"] = dateutil.parser.isoparse(train_start)

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_end = data_config.pop("train_end_date")
    data_config["to_ts"] = dateutil.parser.isoparse(train_end)

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider()

    # Replace the raw tags with their normalized form; 'asset' is optional.
    asset = data_config.get("asset", None)
    data_config["tag_list"] = normalize_sensor_tags(data_config["tag_list"], asset)

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_location = provide_saved_model(
        model_config, data_config, metadata, output_dir, model_register_dir
    )

    # Record where the model ended up in a fixed, well-known file.
    with open("/tmp/model-location.txt", "w") as location_file:
        location_file.write(model_location)
    return 0
示例#5
0
def test_provide_saved_model_caching_handle_existing_different_register(
        tmp_dir):
    """A model already present in the model register but requested into another
    output_dir is expected to be provided at that new output_dir; requesting it
    once more into the (now existing) new output_dir returns it as well."""
    pca_definition = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    dataset_definition = get_random_data()
    output_dir1 = os.path.join(tmp_dir.name, "model1")
    output_dir2 = os.path.join(tmp_dir.name, "model2")
    registry_dir = os.path.join(tmp_dir.name, "registry")

    def request_model(target_dir):
        # Identical request each time apart from the target directory; a fresh
        # empty metadata dict is passed on every call.
        return provide_saved_model(
            name="model-name",
            model_config=pca_definition,
            data_config=dataset_definition,
            metadata={},
            output_dir=target_dir,
            model_register_dir=registry_dir,
        )

    # Initial request into the first directory (result not inspected).
    request_model(output_dir1)

    # Same model, different output directory.
    assert request_model(output_dir2) == output_dir2

    # Same model again, into the second directory which was used just above.
    assert request_model(output_dir2) == output_dir2
示例#6
0
def test_provide_saved_model_simple_happy_path(tmp_dir):
    """
    Test provide_saved_model with no caching (no model register directory is
    passed) and check that the returned location contains a serialized model.
    """
    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmp_dir.name, "model")

    model_location = provide_saved_model(
        name="model-name",
        model_config=model_config,
        data_config=data_config,
        metadata={},
        output_dir=output_dir,
    )

    # Assert the model was saved at the location
    # using gordo_components.serializer should create some subdir(s)
    # which start with 'n_step'
    dirs = [d for d in os.listdir(model_location) if d.startswith("n_step")]
    # The message must be an f-string, otherwise "{len(dirs)}" is shown verbatim
    # in the failure output instead of the actual count.
    assert (
        len(dirs) >= 1
    ), f"Expected saving of model to create at least one subdir, but got {len(dirs)}"
示例#7
0
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
        Note that this dict is modified in place.
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        File object whose ``write`` is called with the location of the serialized
        model.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.

    Returns
    -------
    int
        Always 0.
    """

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines

    raw_tags = data_config.pop("tags")
    data_config["tag_list"] = raw_tags

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_start = data_config.pop("train_start_date")
    data_config["from_ts"] = dateutil.parser.isoparse(train_start)

    # TODO: Move parsing from here, into the InfluxDataSet class
    train_end = data_config.pop("train_end_date")
    data_config["to_ts"] = dateutil.parser.isoparse(train_end)

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    # Replace the raw tags with their normalized form; 'asset' is optional.
    asset = data_config.get("asset", None)
    data_config["tag_list"] = normalize_sensor_tags(data_config["tag_list"], asset)

    # The model config is logged as received, before template expansion.
    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    # Inject the key/value parameters into the template, then parse the yaml.
    rendered_config = expand_model(model_config, dict(model_parameter))
    # NOTE(review): yaml.full_load is not the restricted safe loader - confirm
    # that model_config only ever comes from a trusted source.
    parsed_config = yaml.full_load(rendered_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    expanded_config = pipeline_into_definition(
        pipeline_from_definition(parsed_config)
    )

    build_args = (
        name,
        expanded_config,
        data_config,
        metadata,
        output_dir,
        model_register_dir,
    )
    model_location = provide_saved_model(*build_args)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by asking `provide_saved_model` to replace the cache entry
    # (leaving the old model laying around)
    if print_cv_scores:
        all_scores = get_all_score_strings(load_metadata(model_location))
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )

            model_location = provide_saved_model(*build_args, replace_cache=True)
            all_scores = get_all_score_strings(load_metadata(model_location))

        for score_line in all_scores:
            print(score_line)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
示例#8
0
def test_provide_saved_model_caching(
    should_be_equal: bool,
    metadata: Optional[Dict],
    tag_list: Optional[List[SensorTag]],
    replace_cache,
):
    """
    Exercise the caching of provide_saved_model, including cache busting when
    metadata, tag_list, or replace_cache is set for the second request.

    Two models are requested and the "model-creation-date" entries of their
    metadata are compared; the dates are expected to match exactly when the
    second request was served from the cache.

    Parameters
    ----------
    should_be_equal : bool
        Whether the two creation dates are expected to be identical, i.e. whether
        caching is expected.
    metadata
        Optional metadata for the second model; an empty dict is used when None.
    tag_list
        Optional tag list which, when non-empty, replaces "tag_list" in the data
        config for the second model.
    replace_cache: bool
        Passed through to the second request to force a cache replacement.

    """
    tag_list = [] if tag_list is None else tag_list
    metadata = {} if metadata is None else metadata

    with TemporaryDirectory() as workspace:
        pca_definition = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
        dataset_definition = get_random_data()
        registry_dir = os.path.join(workspace, "registry")

        # First request: always empty metadata and the untouched data config.
        first_location = provide_saved_model(
            name="model-name",
            model_config=pca_definition,
            data_config=dataset_definition,
            output_dir=os.path.join(workspace, "model"),
            metadata={},
            model_register_dir=registry_dir,
        )

        # Second request: same register, different output dir, and optionally
        # different tags / metadata / cache replacement.
        if tag_list:
            dataset_definition["tag_list"] = tag_list
        second_location = provide_saved_model(
            name="model-name",
            model_config=pca_definition,
            data_config=dataset_definition,
            output_dir=os.path.join(workspace, "model2"),
            metadata=metadata,
            model_register_dir=registry_dir,
            replace_cache=replace_cache,
        )

        first_created = serializer.load_metadata(str(first_location))["model"][
            "model-creation-date"
        ]
        second_created = serializer.load_metadata(str(second_location))["model"][
            "model-creation-date"
        ]

        if not should_be_equal:
            assert first_created != second_created
        else:
            assert first_created == second_created
示例#9
0
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used in
        initializing the model. Should also contain key 'type' which references the
        model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. ie. InfluxBackedDataset
        Note that this dict is modified in place.
    data_provider: str
        A quoted data provider configuration in  JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.

    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}

    Returns
    -------
    int
        Always 0.
    """

    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date"))

    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"],
                                                asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    # NOTE(review): yaml.full_load is not the restricted safe loader - confirm
    # that model_config only ever comes from a trusted source.
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(
        pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    # `metadata` (the caller's user-defined metadata) is deliberately never
    # rebound below: it is needed unchanged for a possible forced rebuild.
    # Metadata read back from a cached/built model lives in `retrieved_metadata`.
    if evaluation_config["cv_mode"] == "cross_val_only":

        # Only consult the cache when a model register is configured.
        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(name,
                                            model_config,
                                            data_config,
                                            evaluation_config,
                                            metadata=metadata)
            cache_model_location = check_cache(model_register_dir, cache_key)

        if cache_model_location:
            retrieved_metadata = load_metadata(cache_model_location)
        else:
            # Nothing usable in the cache: build directly, keeping only the
            # metadata part of the result.
            _, retrieved_metadata = build_model(name, model_config, data_config,
                                                metadata, evaluation_config)

    else:
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        retrieved_metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by asking `provide_saved_model` to replace the cache entry
    # (leaving the old model laying around)
    if print_cv_scores:
        all_scores = get_all_score_strings(retrieved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model")

            # Rebuild with the ORIGINAL user-defined metadata, not with the
            # metadata that was loaded from the previous model.
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            all_scores = get_all_score_strings(load_metadata(model_location))

        for score in all_scores:
            print(score)

    return 0