Example #1
    def __set__(self, instance, value):
        if getattr(instance, "_strict", True):
            try:
                from_definition(value)
            except Exception as e:
                raise ValueError(f"Pipeline from definition failed: {e}")
        instance.__dict__[self.name] = value
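
For context, here is a minimal sketch of how a descriptor with this __set__ might be wired to an owner class. The class names are illustrative assumptions, and __set_name__ is one way self.name could be populated; only from_definition is taken from the source.

from gordo.serializer import from_definition

class ValidatedDefinition:
    """Illustrative descriptor: only definitions accepted by from_definition are stored."""

    def __set_name__(self, owner, name):
        self.name = name  # attribute key used by __set__ below

    def __set__(self, instance, value):
        if getattr(instance, "_strict", True):
            try:
                from_definition(value)  # reject anything the serializer can't build
            except Exception as e:
                raise ValueError(f"Pipeline from definition failed: {e}")
        instance.__dict__[self.name] = value

class StrictMachine:
    model = ValidatedDefinition()
    _strict = True

m = StrictMachine()
m.model = {"sklearn.decomposition.PCA": {"n_components": 2}}  # validated on assignment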
Example #2
def test_validation_error():
    config = """
    os.rmdir:
        path: /
    """
    definition = yaml.safe_load(config)
    with pytest.raises(ValueError):
        serializer.from_definition(definition)
Example #3
def test_imputer_from_definition(config_str: str):
    """
    Ensure it plays well with the gordo serializer
    """
    config = yaml.safe_load(config_str)
    model = serializer.from_definition(config)

    if isinstance(model, Pipeline):
        assert isinstance(model.steps[-1][1], InfImputer)
    else:
        assert isinstance(model, InfImputer)

    serializer.from_definition(serializer.into_definition(model))
Example #4
    def __call__(self):
        """Build Keras model from specification"""
        if not all(k in self.kind for k in self._expected_keys):
            raise ValueError(
                f"Expected spec to have keys: {self._expected_keys}, but found {self.kind.keys()}"
            )
        logger.debug(f"Building model from spec: {self.kind}")

        model = serializer.from_definition(self.kind["spec"])

        # Load any compile kwargs as well, such as compile.optimizer which may map to class obj
        kwargs = serializer.from_definition(self.kind["compile"])

        model.compile(**kwargs)
        return model
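
For reference, a minimal 'kind' mapping that would pass the key check above might look like the sketch below (assuming _expected_keys is ("spec", "compile"); Example #5 shows the same structure in YAML):

# Illustrative 'kind' mapping; values are assumptions, not a verbatim gordo config
kind = {
    "spec": {
        "tensorflow.keras.models.Sequential": {
            "layers": [{"tensorflow.keras.layers.Dense": {"units": 1}}]
        }
    },
    "compile": {"loss": "mse", "optimizer": "adam"},
}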
Example #5
def test_raw_keras_part_of_pipeline():
    """
    It should play well, when tucked into a sklearn.pipeline.Pipeline
    """
    X, y = np.random.random((100, 4)), np.random.random((100, 1))

    config_str = """
    sklearn.pipeline.Pipeline:
        steps:
            - sklearn.decomposition.pca.PCA:
                n_components: 4
            - gordo.machine.model.models.KerasRawModelRegressor:
                kind:
                    compile:
                        loss: mse
                        optimizer: adam
                    spec:
                        tensorflow.keras.models.Sequential:
                            layers:
                                - tensorflow.keras.layers.Dense:
                                    units: 4
                                - tensorflow.keras.layers.Dense:
                                    units: 1
    """
    config = yaml.safe_load(config_str)
    pipe = serializer.from_definition(config)
    assert isinstance(pipe, Pipeline)

    pipe.fit(X, y)
    out = pipe.predict(X)
    assert len(out) == len(y)
Example #6
def test_from_definition_test_model():
    config = """
    tests.gordo.serializer.definition_test_model.DefinitionTestModel:
        depth: "300"
    """
    definition = yaml.safe_load(config)
    model = serializer.from_definition(definition)
    assert type(model) == DefinitionTestModel
    assert model.depth == 300
Example #7
def test_into_definition(variations_of_same_pipeline):

    expected_definition = """
        sklearn.pipeline.Pipeline:
            memory: null
            steps:
                - sklearn.decomposition._pca.PCA:
                    copy: true
                    iterated_power: auto
                    n_components: 2
                    random_state: null
                    svd_solver: auto
                    tol: 0.0
                    whiten: false
                - sklearn.pipeline.FeatureUnion:
                    n_jobs: null
                    transformer_list:
                    - sklearn.decomposition._pca.PCA:
                        copy: true
                        iterated_power: auto
                        n_components: 3
                        random_state: null
                        svd_solver: auto
                        tol: 0.0
                        whiten: false
                    - sklearn.pipeline.Pipeline:
                        memory: null
                        steps:
                        - sklearn.preprocessing._data.MinMaxScaler:
                            copy: true
                            feature_range:
                              - 0
                              - 1
                        - sklearn.decomposition._truncated_svd.TruncatedSVD:
                            algorithm: randomized
                            n_components: 2
                            n_iter: 5
                            random_state: null
                            tol: 0.0
                        verbose: false
                    transformer_weights: null
                    verbose: false
                - gordo.machine.model.models.KerasAutoEncoder:
                    kind: feedforward_hourglass
            verbose: false
        """

    expected_definition = yaml.safe_load(expected_definition)

    for pipe in variations_of_same_pipeline:

        definition = into_definition(from_definition(into_definition(pipe)))

        assert json.dumps(definition) == json.dumps(
            expected_definition
        ), f"Failed output:\n{definition}\nExpected:----------------\n{expected_definition}"
Example #8
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer
    """
    config = yaml.safe_load(config)

    model = serializer.from_definition(config)
    serializer.into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
Example #9
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)
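
The 'definition' fixture itself isn't shown; a definition of the shape this test expects might look like the following sketch, where the nested-estimator syntax and parameter values are assumptions:

# Hypothetical fixture value: a MultiOutputRegressor whose 'estimator'
# parameter is itself given as a definition
definition = """
sklearn.multioutput.MultiOutputRegressor:
    estimator:
        sklearn.linear_model.LinearRegression:
            fit_intercept: true
"""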
Example #10
def test_into_from():
    """
    Pass a Pipeline into a definition, then build back from that definition
    """
    from gordo.machine.model.transformer_funcs.general import multiply_by

    factories = register_model_builder.factories
    for model in factories.keys():

        for model_kind in factories[model].keys():
            pipe = Pipeline([
                ("step_0", PCA(n_components=2)),
                (
                    "step_1",
                    FeatureUnion([
                        ("step_0", PCA(n_components=3)),
                        (
                            "step_1",
                            Pipeline(steps=[
                                ("step_0", MinMaxScaler((0, 1))),
                                ("step_1", TruncatedSVD(n_components=2)),
                            ]),
                        ),
                    ]),
                ),
                (
                    "step_2",
                    FunctionTransformer(func=multiply_by,
                                        kw_args={"factor": 1}),
                ),
                (
                    "step_3",
                    pydoc.locate(f"gordo.machine.model.models.{model}")(
                        kind=model_kind),
                ),
            ])

            from_definition(into_definition(pipe))
Example #11
    def test_from_definition(self):

        for raw_yaml, model, model_kind in self.setup_gen():
            self.assertTrue(model)
            logger.info(raw_yaml)
            config = yaml.safe_load(raw_yaml)
            logger.debug("{}".format(config))

            config_clone = copy.deepcopy(config)  # To ensure no mutation occurs
            pipe = from_definition(config)

            # Test that the original config matches the one passed; no mutation
            self.assertEqual(config, config_clone)

            # Special test: defining a non-default argument holds, and an empty
            # YAML value like 'key:  ' is evaluated as 'key=None'
            if "memory: /tmp" in raw_yaml:
                self.assertEqual(pipe.steps[2][1].transformer_list[1][1].memory, "/tmp")
            self._verify_pipe(pipe, model, model_kind)
Example #12
File: build_model.py Project: flikka/gordo
    def _build(self) -> Tuple[sklearn.base.BaseEstimator, Machine]:
        """
        Build the model using the current state of the Builder

        Returns
        -------
            Tuple[sklearn.base.BaseEstimator, Machine]
        """
        # Enforce random seed to 0 if not specified.
        self.set_seed(seed=self.machine.evaluation.get("seed", 0))

        # Get the dataset from config
        logger.debug(
            f"Initializing Dataset with config {self.machine.dataset.to_dict()}"
        )

        dataset = _get_dataset(self.machine.dataset.to_dict())

        logger.debug("Fetching training data")
        start = time.time()

        X, y = dataset.get_data()

        time_elapsed_data = time.time() - start

        # Get the model and dataset
        logger.debug(f"Initializing Model with config: {self.machine.model}")
        model = serializer.from_definition(self.machine.model)

        cv_duration_sec = None

        machine: Machine = Machine(
            name=self.machine.name,
            dataset=self.machine.dataset.to_dict(),
            metadata=self.machine.metadata,
            model=self.machine.model,
            project_name=self.machine.project_name,
            evaluation=self.machine.evaluation,
            runtime=self.machine.runtime,
        )

        split_metadata: Dict[str, Any] = dict()
        scores: Dict[str, Any] = dict()
        if self.machine.evaluation["cv_mode"].lower() in (
                "cross_val_only",
                "full_build",
        ):

            # Build up a metrics list.
            metrics_list = self.metrics_from_list(
                self.machine.evaluation.get("metrics"))

            # Cross validate
            if hasattr(model, "predict"):
                logger.debug("Starting cross validation")
                start = time.time()

                scaler = self.machine.evaluation.get("scoring_scaler")
                metrics_dict = self.build_metrics_dict(metrics_list,
                                                       y,
                                                       scaler=scaler)

                split_obj = serializer.from_definition(
                    self.machine.evaluation.get(
                        "cv",
                        {
                            "sklearn.model_selection.TimeSeriesSplit": {
                                "n_splits": 3
                            }
                        },
                    ))
                # Generate metadata about CV train, test splits
                split_metadata = ModelBuilder.build_split_dict(X, split_obj)

                cv_kwargs = dict(X=X,
                                 y=y,
                                 scoring=metrics_dict,
                                 return_estimator=True,
                                 cv=split_obj)
                if hasattr(model, "cross_validate"):
                    cv = model.cross_validate(**cv_kwargs)
                else:
                    cv = cross_validate(model, **cv_kwargs)

                for metric, test_metric in map(lambda k: (k, f"test_{k}"),
                                               metrics_dict):
                    val = {
                        "fold-mean": cv[test_metric].mean(),
                        "fold-std": cv[test_metric].std(),
                        "fold-max": cv[test_metric].max(),
                        "fold-min": cv[test_metric].min(),
                    }
                    val.update({
                        f"fold-{i + 1}": raw_value
                        for i, raw_value in enumerate(cv[test_metric].tolist())
                    })
                    scores.update({metric: val})

                cv_duration_sec = time.time() - start
            else:
                logger.debug(
                    "Unable to score model, has no attribute 'predict'.")

            # If cross_val_only, return without fitting to the whole dataset
            if self.machine.evaluation["cv_mode"] == "cross_val_only":
                machine.metadata.build_metadata = BuildMetadata(
                    model=ModelBuildMetadata(
                        cross_validation=CrossValidationMetaData(
                            cv_duration_sec=cv_duration_sec,
                            scores=scores,
                            splits=split_metadata,
                        )),
                    dataset=DatasetBuildMetadata(
                        query_duration_sec=time_elapsed_data,
                        dataset_meta=dataset.get_metadata(),
                    ),
                )
                return model, machine

        # Train
        logger.debug("Starting to train model.")
        start = time.time()
        model.fit(X, y)
        time_elapsed_model = time.time() - start

        # Build specific metadata
        machine.metadata.build_metadata = BuildMetadata(
            model=ModelBuildMetadata(
                model_offset=self._determine_offset(model, X),
                model_creation_date=str(
                    datetime.datetime.now(datetime.timezone.utc).astimezone()),
                model_builder_version=__version__,
                model_training_duration_sec=time_elapsed_model,
                cross_validation=CrossValidationMetaData(
                    cv_duration_sec=cv_duration_sec,
                    scores=scores,
                    splits=split_metadata,
                ),
                model_meta=self._extract_metadata_from_model(model),
            ),
            dataset=DatasetBuildMetadata(
                query_duration_sec=time_elapsed_data,
                dataset_meta=dataset.get_metadata(),
            ),
        )
        return model, machine
Example #13
File: cli.py Project: fagan2888/gordo
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    """
    if model_parameter and isinstance(machine_config["model"], str):
        parameters = dict(model_parameter)  # convert list of tuples to dict
        machine_config["model"] = expand_model(machine_config["model"],
                                               parameters)

    machine: Machine = Machine.from_config(
        machine_config, project_name=machine_config["project_name"])

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Register dir: {model_register_dir}")

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    machine.model = serializer.into_definition(
        serializer.from_definition(machine.model))
    logger.info(f"Fully expanded model config: {machine.model}")

    builder = ModelBuilder(machine=machine)

    try:
        _, machine_out = builder.build(output_dir,
                                       model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception as e:
        exit_code = EXCEPTION_TO_EXITCODE.get(e.__class__, 1)
        traceback.print_exc()
        sys.exit(exit_code)
    else:
        return 0
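
As a sketch of the model_parameter expansion the docstring describes (the template string below is an illustrative assumption; expand_model and its call signature are taken from the source):

# A model config given as a string may contain jinja variables...
model_config = "sklearn.decomposition.PCA: {n_components: {{ components }}}"
# ...which expand_model fills in from the --model-parameter key/value pairs
machine_config["model"] = expand_model(model_config, {"components": 2})
# expected result: "sklearn.decomposition.PCA: {n_components: 2}"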
Example #14
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
    exceptions_reporter_file: str,
    exceptions_report_level: str,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    exceptions_reporter_file: str
        JSON output file for exception information
    exceptions_report_level: str
        Details level for exception reporting
    """

    try:
        if model_parameter and isinstance(machine_config["model"], str):
            parameters = dict(model_parameter)  # convert list of tuples to dict
            machine_config["model"] = expand_model(machine_config["model"],
                                                   parameters)

        machine: Machine = Machine.from_config(
            machine_config, project_name=machine_config["project_name"])

        logger.info(f"Building, output will be at: {output_dir}")
        logger.info(f"Register dir: {model_register_dir}")

        # Convert the config into a pipeline, and back into definition to ensure
        # all default parameters are part of the config.
        logger.debug(f"Ensuring the passed model config is fully expanded.")
        machine.model = serializer.into_definition(
            serializer.from_definition(machine.model))
        logger.info(f"Fully expanded model config: {machine.model}")

        builder = ModelBuilder(machine=machine)

        _, machine_out = builder.build(output_dir,
                                       model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if "err" in machine.name:
            raise FileNotFoundError("undefined_file.parquet")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception:
        traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()

        exit_code = _exceptions_reporter.exception_exit_code(exc_type)
        if exceptions_reporter_file:
            _exceptions_reporter.safe_report(
                cast(
                    ReportLevel,
                    ReportLevel.get_by_name(exceptions_report_level,
                                            ReportLevel.EXIT_CODE),
                ),
                exc_type,
                exc_value,
                exc_traceback,
                exceptions_reporter_file,
                max_message_len=2024 - 500,
            )
        sys.exit(exit_code)
    else:
        return 0
Example #15
File: base.py Project: yinxiEquinor/gordo
    @classmethod
    def from_dict(cls, config: Dict[str, Any]) -> "BaseReporter":
        """
        Reconstruct the reporter from a dict representation or a single
        import path if it doesn't require any init parameters.
        """
        return serializer.from_definition(config)
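
A usage sketch; the reporter import path below is an assumption for illustration:

# Either a full definition dict...
reporter = BaseReporter.from_dict({"gordo.reporters.mlflow.MlFlowReporter": {}})
# ...or a single import path, when the reporter needs no init parameters
reporter = BaseReporter.from_dict("gordo.reporters.mlflow.MlFlowReporter")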
Example #16
def test_from_into():
    """
    Create a pipeline from a definition, then convert it back into a definition
    """
    factories = register_model_builder.factories
    for model in factories.keys():
        for model_kind in factories[model].keys():
            definition = f"""
                sklearn.pipeline.Pipeline:
                    steps:
                        - sklearn.decomposition.PCA:
                            n_components: 2
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.preprocessing._function_transformer.FunctionTransformer:
                            func: gordo.machine.model.transformer_funcs.general.multiply_by
                            kw_args:
                                factor: 1
                            inverse_func: gordo.machine.model.transformer_funcs.general.multiply_by
                            inv_kw_args:
                                factor: 1
                        - sklearn.pipeline.FeatureUnion:
                            transformer_list:
                            - sklearn.decomposition.PCA:
                                n_components: 3
                                copy: true
                                whiten: false
                                svd_solver: auto
                                tol: 0.0
                                iterated_power: auto
                                random_state:
                            - sklearn.pipeline.Pipeline:
                                steps:
                                - sklearn.preprocessing.MinMaxScaler:
                                    feature_range:
                                    - 0
                                    - 1
                                    copy: true
                                - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                    n_components: 2
                                    algorithm: randomized
                                    n_iter: 5
                                    random_state:
                                    tol: 0.0
                                memory:
                                verbose: false
                            n_jobs: 1
                            transformer_weights:
                            verbose: false
                        - gordo.machine.model.models.{model}:
                            kind: {model_kind}
                    memory:
                    verbose: false
                """
            definition = yaml.safe_load(definition)
            pipe = from_definition(definition)
            into_definition(pipe)
Example #17
File: build_model.py Project: flikka/gordo
    def build_metrics_dict(
        metrics_list: list,
        y: pd.DataFrame,
        scaler: Optional[Union[TransformerMixin, str]] = None,
    ) -> dict:
        """
        Given a list of metrics that accept true_y and pred_y as inputs, return a
        dictionary with a key of the form '{score}-{tag_name}' for each given target
        tag, plus '{score}' for the average score across all target tags and folds.
        Each value is the callable make_scorer(metric_wrapper(score)). Note: 'score' in
        '{score}-{tag_name}' is the sklearn score function name with '_' replaced by '-',
        and 'tag_name' is the given target tag name with ' ' replaced by '-'.

        Parameters
        ----------
        metrics_list: list
            List of sklearn score functions
        y: pd.DataFrame
            Target data
        scaler : Optional[Union[TransformerMixin, str]]
            Scaler which will be fitted on y, and used to transform the data before
            scoring. Useful when the metrics are sensitive to the amplitude of the data, and
            you have multiple targets.


        Returns
        -------
            dict
        """
        if scaler:
            if isinstance(scaler, str) or isinstance(scaler, dict):
                scaler = serializer.from_definition(scaler)
            logger.debug("Fitting scaler for scoring purpose")
            scaler.fit(y)

        def _score_factory(metric_func=metrics.r2_score, col_index=0):
            def _score_per_tag(y_true, y_pred):
                # This function extracts the score for each given target_tag to
                # use as scoring argument in sklearn cross_validate, as the scoring
                # must return a single value.
                if hasattr(y_true, "values"):
                    y_true = y_true.values
                if hasattr(y_pred, "values"):
                    y_pred = y_pred.values

                return metric_func(y_true[:, col_index], y_pred[:, col_index])

            return _score_per_tag

        metrics_dict = {}
        for metric in metrics_list:
            for index, col in enumerate(y.columns):
                metric_str = metric.__name__.replace("_", "-")
                metrics_dict.update({
                    metric_str + f'-{col.replace(" ", "-")}':
                    metrics.make_scorer(
                        metric_wrapper(
                            _score_factory(metric_func=metric,
                                           col_index=index),
                            scaler=scaler,
                        ))
                })

            metrics_dict.update({
                metric_str:
                metrics.make_scorer(metric_wrapper(metric, scaler=scaler))
            })
        return metrics_dict
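
A usage sketch of the key naming described in the docstring; the column names and metric list are illustrative, and build_metrics_dict is assumed to be reachable as a staticmethod on ModelBuilder:

import numpy as np
import pandas as pd
from sklearn import metrics

y = pd.DataFrame(np.random.random((100, 2)), columns=["Tag 1", "Tag 2"])
scorers = ModelBuilder.build_metrics_dict([metrics.r2_score], y)
# Expected keys per the docstring:
#   "r2-score-Tag-1", "r2-score-Tag-2" and "r2-score" (average across tags)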