def test_invalid_model(default_globals: dict):
    """
    Test invalid model with 'step' instead of 'steps'
    """
    element_str = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE -23-0733.PV, GRA-TT -23-0719.PV, GRA-YE -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            step:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
          scoring_scaler: Null
        metadata:
          id: special-id
    """
    element = get_dict_from_yaml(StringIO(element_str))
    with pytest.raises(ValueError):
        Machine.from_config(
            element, project_name="test-project-name", config_globals=default_globals
        )

def test_build_cv_mode_cross_val_cache(
    tmpdir,
    should_save_model: bool,
    cv_mode_1: str,
    cv_mode_2: str,
    runner: CliRunner,
    machine: Machine,
):
    """
    Checks that cv_scores uses the cache if run after a full build. Loads the
    same model, and can print the cv_scores from it.
    """
    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    machine.evaluation = cv_mode_1  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    machine.evaluation = cv_mode_2  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    if should_save_model:
        assert len(os.listdir(tmpdir)) > 0
    else:
        assert len(os.listdir(tmpdir)) == 0

def __init__(self, machine: Machine):
    """
    Build a model for a given :class:`gordo.workflow.config_elements.machine.Machine`

    Parameters
    ----------
    machine: Machine

    Example
    -------
    >>> from gordo_dataset.sensor_tag import SensorTag
    >>> from gordo.machine import Machine
    >>> from gordo.dependencies import configure_once
    >>> configure_once()
    >>> machine = Machine(
    ...     name="special-model-name",
    ...     model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
    ...     dataset={
    ...         "type": "RandomDataset",
    ...         "train_start_date": "2017-12-25 06:00:00Z",
    ...         "train_end_date": "2017-12-30 06:00:00Z",
    ...         "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    ...         "target_tag_list": [SensorTag("Tag 3", None), SensorTag("Tag 4", None)]
    ...     },
    ...     project_name='test-proj',
    ... )
    >>> builder = ModelBuilder(machine=machine)
    >>> model, machine = builder.build()
    """
    # Avoid overwriting the passed machine; a plain copy doesn't work if it holds a
    # reference to a loaded Tensorflow model, so .to_dict() serializes it to a
    # primitive dict representation first.
    self.machine = Machine(**machine.to_dict())

def test_postgres_reporter(postgresdb, metadata):
    """
    Check logging of a machine into postgres
    """
    reporter1 = PostgresReporter(host="localhost")
    machine1 = Machine(**metadata)

    # Before inserting, the machine does not exist.
    with pytest.raises(peewee.DoesNotExist):
        PostgresMachine.get(PostgresMachine.name == machine1.name)

    reporter1.report(machine1)
    record = PostgresMachine.get(PostgresMachine.name == machine1.name)
    assert record.name == machine1.name

    # Create another reporter to ensure nothing happened to the DB
    reporter2 = PostgresReporter(host="localhost")
    machine2 = Machine(**metadata)
    machine2.name = "another-machine"
    reporter2.report(machine2)

    # The first machine is still there
    record = PostgresMachine.get(PostgresMachine.name == machine1.name)
    assert record.name == machine1.name

    # And the second
    record = PostgresMachine.get(PostgresMachine.name == machine2.name)
    assert record.name == machine2.name

def test_build_cv_mode(
    tmpdir, runner: CliRunner, should_save_model: bool, cv_mode: str, machine: Machine
):
    """
    Testing build with cv_mode set to full and cross_val_only. Checks that
    cv_scores are printed and models are only saved when using the default
    (full) value.
    """
    machine.model = MODEL_CONFIG_WITH_PREDICT
    machine.evaluation = cv_mode  # type: ignore

    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    tmp_model_dir = os.path.join(tmpdir, "tmp")
    os.makedirs(tmp_model_dir, exist_ok=True)

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=tmp_model_dir):
        result = runner.invoke(cli.gordo, ["build", "--print-cv-scores"])
        assert result.exit_code == 0

        # Checks that the model directory is populated or empty depending on the mode.
        if should_save_model:
            assert len(os.listdir(tmp_model_dir)) != 0
        else:
            assert len(os.listdir(tmp_model_dir)) == 0

        # Checks that the output contains the printed cv scores for each metric
        assert "r2-score" in result.output
        assert "mean-squared-error" in result.output
        assert "mean-absolute-error" in result.output
        assert "explained-variance-score" in result.output

def test_builder_with_reporter(postgresdb, metadata):
    """
    Verify a machine can take a reporter and that .report() will run any
    given reporters
    """
    reporter = PostgresReporter(host="localhost")
    metadata["runtime"]["reporters"].append(reporter.to_dict())

    machine = Machine(**metadata)

    with pytest.raises(peewee.DoesNotExist):
        PostgresMachine.get(PostgresMachine.name == machine.name)
    machine.report()
    PostgresMachine.get(PostgresMachine.name == machine.name)

def test_client_get_dataset(gordo_project, metadata, ml_server):
    data_provider = providers.RandomDataProvider(min_size=10)
    client = Client(project=gordo_project, data_provider=data_provider)
    start = isoparse("2016-01-01T00:00:00+00:00")
    end = isoparse("2016-01-01T12:00:00+00:00")
    machine = Machine(**metadata)
    assert type(machine.dataset) is TimeSeriesDataset
    machine.dataset.row_filter_buffer_size = 12
    machine.dataset.n_samples_threshold = 10
    client_machine = ClientMachine(**machine.to_dict())
    dataset = client._get_dataset(client_machine, start, end)
    assert dataset.row_filter_buffer_size == 0
    assert dataset.n_samples_threshold == 0
    assert dataset.low_threshold is None
    assert dataset.high_threshold is None

def machine():
    return Machine(
        name="test-model",
        model=MODEL_CONFIG,
        dataset=DATA_CONFIG,
        project_name="project-name",
    )

def test_model_builder_metrics_list(metrics_: Optional[List[str]]):
    model_config = {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.linear_model.LinearRegression"
        }
    }
    data_config = get_random_data()

    evaluation_config: Dict[str, Any] = {"cv_mode": "full_build"}
    if metrics_:
        evaluation_config.update({"metrics": metrics_})

    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )
    _model, machine = ModelBuilder(machine).build()

    expected_metrics = metrics_ or [
        "sklearn.metrics.explained_variance_score",
        "sklearn.metrics.r2_score",
        "sklearn.metrics.mean_squared_error",
        "sklearn.metrics.mean_absolute_error",
    ]

    assert all(
        metric.split(".")[-1].replace("_", "-")
        in machine.metadata.build_metadata.model.cross_validation.scores
        for metric in expected_metrics
    )

def test_setting_seed(seed, model_config):
    """
    Test that we can set the seed and get the same results.
    """
    data_config = get_random_data()
    evaluation_config = {"cv_mode": "full_build", "seed": seed}

    # Training two instances without a seed should result in different scores,
    # while doing it with a seed should result in the same scores.
    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )
    _model, machine1 = ModelBuilder(machine).build()
    _model, machine2 = ModelBuilder(machine).build()

    df1 = pd.DataFrame.from_dict(
        machine1.metadata.build_metadata.model.cross_validation.scores
    )
    df2 = pd.DataFrame.from_dict(
        machine2.metadata.build_metadata.model.cross_validation.scores
    )

    # Equality depends on the seed being set.
    if seed:
        assert df1.equals(df2)
    else:
        assert not df1.equals(df2)

def log_machine(mlflow_client: MlflowClient, run_id: str, machine: Machine):
    """
    Send logs to configured MLflow backend

    Parameters
    ----------
    mlflow_client: MlflowClient
        Client instance to call logging methods from.
    run_id: str
        Unique ID of the MLflow Run to log to.
    machine: Machine
        Machine to log with MlflowClient.
    """
    # Log machine metrics and params
    for batch_kwargs in batch_log_items(*get_machine_log_items(machine)):
        mlflow_client.log_batch(run_id, **batch_kwargs)

    # Send configs as JSON artifacts
    try:
        with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
            fp = os.path.join(tmp_dir, "metadata.json")
            with open(fp, "w") as fh:
                json.dump(machine.to_dict(), fh, cls=MachineEncoder)
            mlflow_client.log_artifacts(run_id=run_id, local_dir=tmp_dir)
    # Map to MlflowLoggingError for coding errors in the model builder
    except Exception as e:
        raise MlflowLoggingError(e)

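# Usage sketch (illustrative, not part of this module): the tests in this file call
# log_machine through the `mlflow_context` context manager, which yields a client and
# a run id. Assuming `machine` is a built gordo Machine, the call looks roughly like:
#
#     with mlflow_context("experiment-name", "unique_key", {}, {}) as (client, run_id):
#         log_machine(client, run_id, machine)
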
def test_n_splits_from_config(mocked_pipeline_from_definition, cv):
    """
    Test that we can set arbitrary splitters and parameters in the config file
    which is called by the serializer.
    """
    data_config = get_random_data()
    evaluation_config = {"cv_mode": "full_build"}
    if cv:
        evaluation_config["cv"] = cv

    model_config = {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.ensemble.forest.RandomForestRegressor"
        }
    }

    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )

    ModelBuilder(machine).build()

    if cv:
        mocked_pipeline_from_definition.assert_called_with(cv)
    else:
        mocked_pipeline_from_definition.assert_called_with(
            {"sklearn.model_selection.TimeSeriesSplit": {"n_splits": 3}}
        )

def report(self, machine: GordoMachine):
    """
    Log a machine to Postgres, where the top level keys 'name', 'dataset',
    'model', and 'metadata' map to BinaryJSON fields.

    Parameters
    ----------
    machine: gordo.machine.Machine

    Returns
    -------
    None
    """
    try:
        with self.db.atomic():
            logger.info(f"Inserting machine {machine.name} in sql")  # type: ignore

            # Ensure it's serializable using MachineEncoder
            record = json.loads(json.dumps(machine.to_dict(), cls=MachineEncoder))
            model = dict_to_model(Machine, record, ignore_unknown=True)
            try:
                Machine.get(Machine.name == machine.name)
            except peewee.DoesNotExist:
                model.save()
            else:
                query = Machine.update(**model_to_dict(model)).where(
                    Machine.name == machine.name
                )
                query.execute()
    except Exception as exc:
        raise PostgresReporterException(exc)

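# Usage sketch (illustrative, mirroring the reporter tests in this file): a reporter
# only needs the Postgres host, and report() inserts or updates the row keyed by the
# machine's name:
#
#     reporter = PostgresReporter(host="localhost")
#     reporter.report(machine)
#     reporter.report(machine)  # second call with the same machine.name updates the row
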
def test_builder_calls_machine_report(mocked_report_method, metadata):
    """
    When building a machine, ModelBuilder.build should call Machine.report()
    so that it can run any reporters in the Machine's runtime.
    """
    machine = Machine(**metadata)
    ModelBuilder(machine).build()
    mocked_report_method.assert_called_once()

def test_scores_metadata(raw_model_config):
    data_config = get_random_data()
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    machine = Machine(
        dataset=data_config, model=model_config, name="model-name", project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    machine_check(machine_out, False)

def test_get_machine_log_items(metadata):
    """
    Test that dicts are correctly converted to MLflow types or errors raised
    """
    metrics, params = mlu.get_machine_log_items(Machine(**metadata))

    assert all(type(m) == Metric for m in metrics)
    assert all(type(p) == Param for p in params)

def test_overwrite_report(postgresdb, metadata):
    """
    Ensure saving the same machine twice is ok.
    """
    reporter1 = PostgresReporter(host="localhost")
    reporter2 = PostgresReporter(host="localhost")

    machine1 = Machine(**metadata)
    machine2 = Machine(**metadata)

    reporter1.report(machine1)
    # Reporting twice should be ok.
    reporter2.report(machine2)

    results = PostgresMachine.select().where(PostgresMachine.name == machine1.name)
    assert len([result for result in results]) == 1

def _machine_from_server(self, name: str, revision: str) -> Machine:
    resp = self.session.get(
        f"{self.base_url}/gordo/v0/{self.project_name}/{name}/metadata",
        params={"revision": revision},
    )
    metadata = _handle_response(
        resp=resp, resource_name=f"Machine metadata for {name}"
    )
    if isinstance(metadata, dict) and metadata.get("metadata", None):
        return Machine(**metadata.get("metadata", None))
    else:
        raise NotFound(f"Machine {name} not found")

def test_builder_metadata(raw_model_config):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    data_config = get_random_data()
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    # Check metadata, and only verify 'history' if it's a *Keras* type model
    machine_check(machine_out, "Keras" in raw_model_config)

def test_provide_saved_model_simple_happy_path(tmpdir):
    """
    Test provide_saved_model with no caching
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    ModelBuilder(machine).build(output_dir=output_dir)

    # Assert the model was saved at the location.
    # Should be the model file and the metadata.
    assert len(os.listdir(output_dir)) == 2

def test_mlflow_context_log_error(MockClient, metadata):
    """
    Test that an error while logging metadata as an artifact raises
    MlflowLoggingError
    """
    metadata = Machine(**metadata)
    mock_client = MockClient()
    mock_client.log_artifacts.side_effect = Exception("Some unknown exception!")

    with pytest.raises(mlu.MlflowLoggingError):
        with mlu.mlflow_context("returns metadata", "unique_key", {}, {}) as (
            mlflow_client,
            run_id,
        ):
            mlu.log_machine(mlflow_client, run_id, metadata)

def test_build_cv_mode_build_only(tmpdir, runner: CliRunner, machine: Machine):
    """
    Testing build with cv_mode set to build_only. Checks that OUTPUT_DIR gets a
    model saved to it. It also checks that the metadata contains
    cv-duration-sec=None and cv-scores={}
    """
    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")
    machine.evaluation = {"cv_mode": "build_only"}

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        metadata_file = f"{os.path.join(tmpdir, 'metadata.json')}"
        runner.invoke(cli.gordo, ["build"])

        # A model has been saved
        assert len(os.listdir(tmpdir)) != 0
        with open(metadata_file) as f:
            metadata_json = json.loads(f.read())
            assert (
                metadata_json["metadata"]["build_metadata"]["model"][
                    "cross_validation"
                ]["cv_duration_sec"]
                is None
            )
            assert (
                metadata_json["metadata"]["build_metadata"]["model"][
                    "cross_validation"
                ]["scores"]
                == {}
            )

def test_output_scores_metadata():
    data_config = get_random_data()
    raw_model_config = f"""
    gordo.machine.model.anomaly.diff.DiffBasedAnomalyDetector:
        scaler: sklearn.preprocessing.MinMaxScaler
        base_estimator:
            sklearn.compose.TransformedTargetRegressor:
                transformer: sklearn.preprocessing.MinMaxScaler
                regressor:
                    sklearn.pipeline.Pipeline:
                        steps:
                        - sklearn.preprocessing.MinMaxScaler
                        - gordo.machine.model.models.KerasAutoEncoder:
                            kind: feedforward_hourglass
                            batch_size: 3
                            compression_factor: 0.5
                            encoding_layers: 1
                            func: tanh
                            out_func: linear
                            epochs: 1
    """
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    scores_metadata = machine_out.metadata.build_metadata.model.cross_validation.scores

    assert (
        scores_metadata["explained-variance-score-Tag-1"]["fold-mean"]
        + scores_metadata["explained-variance-score-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["explained-variance-score"]["fold-mean"])

    assert (
        scores_metadata["r2-score-Tag-1"]["fold-mean"]
        + scores_metadata["r2-score-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["r2-score"]["fold-mean"])

    assert (
        scores_metadata["mean-squared-error-Tag-1"]["fold-mean"]
        + scores_metadata["mean-squared-error-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["mean-squared-error"]["fold-mean"])

    assert (
        scores_metadata["mean-absolute-error-Tag-1"]["fold-mean"]
        + scores_metadata["mean-absolute-error-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["mean-absolute-error"]["fold-mean"])

def test_provide_saved_model_caching_handle_existing_same_dir(tmpdir):
    """
    If the model exists in the model register, and the path there is the same
    as output_dir, output_dir is returned.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    registry_dir = os.path.join(tmpdir, "registry")
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)

    assert builder.cached_model_path == output_dir

    # Saving to same output_dir as the one saved in the registry just returns the output_dir
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir

def _machine(name: str) -> Machine:
    """
    Helper to build a basic Machine, only defining its name
    """
    from gordo_dataset.sensor_tag import SensorTag

    return Machine.from_config(
        config={
            "name": name,
            "dataset": {
                "tag_list": [SensorTag("tag-1", "foo"), SensorTag("tag-2", "foo")],
                "train_start_date": "2016-01-01T00:00:00Z",
                "train_end_date": "2016-01-05T00:00:00Z",
            },
            "model": {"sklearn.linear_model.LinearRegression": {}},
        },
        project_name="test-project",
    )

def test_mlflow_context_log_metadata(MockClient, tmpdir, metadata):
    """
    Test that call to wrapped function initiates MLflow logging or throws warning
    """
    metadata = Machine(**metadata)

    mlflow.set_tracking_uri(f"file:{tmpdir}")

    mock_client = MockClient()
    mock_client.log_batch.return_value = "test"

    # Function with a metadata dict returned
    with mlu.mlflow_context("returns metadata", "unique_key", {}, {}) as (
        mlflow_client,
        run_id,
    ):
        mlu.log_machine(mlflow_client, run_id, metadata)

    assert mock_client.log_batch.called

def test_output_dir(tmpdir):
    """
    Test building of model will create subdirectories for model saving if needed.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "some", "sub", "directories")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    model, machine_out = builder.build()
    machine_check(machine_out, False)

    builder._save_model(model=model, machine=machine_out, output_dir=output_dir)

    # Assert the model was saved at the location.
    # Should be the model file and the metadata.
    assert len(os.listdir(output_dir)) == 2

def __init__(self, config: dict, project_name: str):
    default_globals = self.DEFAULT_CONFIG_GLOBALS
    default_globals["runtime"]["influx"][  # type: ignore
        "resources"
    ] = _calculate_influx_resources(  # type: ignore
        len(config["machines"])
    )

    passed_globals = config.get("globals", dict())
    patched_globals = patch_dict(default_globals, passed_globals)
    if patched_globals.get("runtime"):
        patched_globals["runtime"] = fix_runtime(patched_globals.get("runtime"))

    self.project_name = project_name
    self.machines = [
        Machine.from_config(
            conf, project_name=project_name, config_globals=patched_globals
        )
        for conf in config["machines"]
    ]  # type: List[Machine]

    self.globals = patched_globals

def __init__(
    self,
    config: dict,
    project_name: str,
    gordo_version: Optional[str] = None,
    model_builder_env: Optional[dict] = None,
):
    if gordo_version is None:
        gordo_version = __version__
    default_globals = self.get_default_globals(gordo_version)
    default_globals["runtime"]["influx"][  # type: ignore
        "resources"
    ] = _calculate_influx_resources(  # type: ignore
        len(config["machines"])
    )

    passed_globals = config.get("globals", dict())

    # Kept for backwards compatibility: only apply model_builder_env when the
    # passed globals do not already define runtime.builder.env
    if model_builder_env is not None and not (
        passed_globals
        and "runtime" in passed_globals
        and "builder" in passed_globals["runtime"]
        and "env" in passed_globals["runtime"]["builder"]
    ):
        if "builder" not in default_globals["runtime"]:
            default_globals["runtime"]["builder"] = {}
        default_globals["runtime"]["builder"]["env"] = model_builder_env

    patched_globals = patch_dict(default_globals, passed_globals)
    patched_globals = self.prepare_patched_globals(patched_globals)

    self.project_name = project_name
    self.machines: List[Machine] = [
        Machine.from_config(
            conf, project_name=project_name, config_globals=patched_globals
        )
        for conf in config["machines"]
    ]

    self.globals: dict = patched_globals

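# Behaviour sketch for the model_builder_env back-compat branch above (the enclosing
# class name and values are assumed here for illustration): user globals that already
# pin runtime.builder.env win over the model_builder_env argument, because patch_dict
# merges passed_globals on top of the defaults; otherwise model_builder_env survives:
#
#     config = {
#         "machines": [...],
#         "globals": {"runtime": {"builder": {"env": {"LOG_LEVEL": "DEBUG"}}}},
#     }
#     normalized = NormalizedConfig(
#         config, project_name="proj", model_builder_env={"LOG_LEVEL": "INFO"}
#     )
#     # normalized.globals["runtime"]["builder"]["env"] == {"LOG_LEVEL": "DEBUG"}
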
def test_provide_saved_model_caching_handle_existing_different_register(tmpdir):
    """
    If the model exists in the model register, but output_dir is not where the
    model is, the model is copied to the new location, unless the new location
    already exists; if it does, then that location is returned.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir1 = os.path.join(tmpdir, "model1")
    output_dir2 = os.path.join(tmpdir, "model2")
    registry_dir = os.path.join(tmpdir, "registry")
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir1, model_register_dir=registry_dir)

    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir2

    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir2