def test_invalid_model(default_globals: dict):
    """
    A pipeline definition using 'step' instead of 'steps' must be rejected
    with a ValueError when the Machine is built from the config.
    """
    invalid_yaml = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE -23-0733.PV, GRA-TT -23-0719.PV, GRA-YE -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            step:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    machine_config = get_dict_from_yaml(StringIO(invalid_yaml))

    # The misspelled 'step' key is the only thing wrong with this config.
    with pytest.raises(ValueError):
        Machine.from_config(
            machine_config,
            project_name="test-project-name",
            config_globals=default_globals,
        )
def _machine(name: str) -> Machine:
    """
    Construct a minimal Machine for tests; only the name is caller-supplied.
    """
    from gordo_dataset.sensor_tag import SensorTag

    tags = [SensorTag(tag_name, "foo") for tag_name in ("tag-1", "tag-2")]
    dataset_config = {
        "tag_list": tags,
        "train_start_date": "2016-01-01T00:00:00Z",
        "train_end_date": "2016-01-05T00:00:00Z",
    }
    machine_config = {
        "name": name,
        "dataset": dataset_config,
        "model": {"sklearn.linear_model.LinearRegression": {}},
    }
    return Machine.from_config(config=machine_config, project_name="test-project")
def __init__(self, config: dict, project_name: str):
    """
    Build the machines of a project from a raw config.

    The class-level ``DEFAULT_CONFIG_GLOBALS`` are combined with the
    ``globals`` section of ``config`` (if any) via ``patch_dict`` and the
    result is handed to every ``Machine.from_config`` call.

    Parameters
    ----------
    config: dict
        Must contain a ``machines`` list; may contain a ``globals`` mapping.
    project_name: str
        Project name passed on to each machine.
    """
    import copy

    # Work on a private copy: the influx resources written below depend on
    # this particular config and must not leak into the class-level defaults
    # shared by every other instance.
    default_globals = copy.deepcopy(self.DEFAULT_CONFIG_GLOBALS)
    default_globals["runtime"]["influx"][  # type: ignore
        "resources"
    ] = _calculate_influx_resources(  # type: ignore
        len(config["machines"])
    )

    # A 'globals' key present but empty (None) is treated like a missing one.
    passed_globals = config.get("globals") or dict()
    patched_globals = patch_dict(default_globals, passed_globals)
    if patched_globals.get("runtime"):
        patched_globals["runtime"] = fix_runtime(patched_globals.get("runtime"))

    self.project_name = project_name
    self.machines = [
        Machine.from_config(
            conf, project_name=project_name, config_globals=patched_globals
        )
        for conf in config["machines"]
    ]  # type: List[Machine]
    self.globals = patched_globals
def __init__(
    self,
    config: dict,
    project_name: str,
    gordo_version: Optional[str] = None,
    model_builder_env: Optional[dict] = None,
):
    """
    Build the machines of a project from a raw config.

    Default globals for the given gordo version are patched with the
    ``globals`` section of ``config`` and passed to every machine.

    Parameters
    ----------
    config: dict
        Must contain a ``machines`` list; may contain a ``globals`` mapping.
    project_name: str
        Project name passed on to each machine.
    gordo_version: Optional[str]
        Version used to look up the default globals; falls back to
        ``__version__`` when omitted.
    model_builder_env: Optional[dict]
        Legacy way of supplying the builder environment. Only used when the
        passed globals do not define ``runtime.builder.env`` themselves.
    """
    version = __version__ if gordo_version is None else gordo_version
    base_globals = self.get_default_globals(version)

    machine_count = len(config["machines"])
    base_globals["runtime"]["influx"][  # type: ignore
        "resources"
    ] = _calculate_influx_resources(machine_count)  # type: ignore

    user_globals = config.get("globals", dict())

    # Kept for backwards compatibility: the explicit argument is only a
    # fallback, an env defined in the passed globals always wins.
    if model_builder_env is not None:
        env_in_user_globals = (
            user_globals
            and "runtime" in user_globals
            and "builder" in user_globals["runtime"]
            and "env" in user_globals["runtime"]["builder"]
        )
        if not env_in_user_globals:
            builder_defaults = base_globals["runtime"].setdefault("builder", {})
            builder_defaults["env"] = model_builder_env

    merged_globals = self.prepare_patched_globals(
        patch_dict(base_globals, user_globals)
    )

    self.project_name = project_name
    self.machines: List[Machine] = [
        Machine.from_config(
            machine_conf, project_name=project_name, config_globals=merged_globals
        )
        for machine_conf in config["machines"]
    ]
    self.globals: dict = merged_globals
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
    exceptions_reporter_file: str,
    exceptions_report_level: str,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    exceptions_reporter_file: str
        JSON output file for exception information
    exceptions_report_level: str
        Details level for exception reporting
    """
    try:
        if model_parameter and isinstance(machine_config["model"], str):
            parameters = dict(model_parameter)  # convert list of tuples to dict
            machine_config["model"] = expand_model(machine_config["model"], parameters)

        machine: Machine = Machine.from_config(
            machine_config, project_name=machine_config["project_name"]
        )

        logger.info(f"Building, output will be at: {output_dir}")
        logger.info(f"Register dir: {model_register_dir}")

        # Convert the config into a pipeline, and back into definition to ensure
        # all default parameters are part of the config.
        logger.debug("Ensuring the passed model config is fully expanded.")
        machine.model = serializer.into_definition(
            serializer.from_definition(machine.model)
        )
        logger.info(f"Fully expanded model config: {machine.model}")

        builder = ModelBuilder(machine=machine)
        _, machine_out = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        # NOTE: an artificial FileNotFoundError used to be raised here for any
        # machine whose name contained "err". That fault injection broke
        # otherwise successful builds and has been removed.

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception:
        # Top-level boundary of the command: print the traceback, optionally
        # write a report file, and turn the exception into a process exit code.
        traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()

        exit_code = _exceptions_reporter.exception_exit_code(exc_type)
        if exceptions_reporter_file:
            _exceptions_reporter.safe_report(
                cast(
                    ReportLevel,
                    ReportLevel.get_by_name(
                        exceptions_report_level, ReportLevel.EXIT_CODE
                    ),
                ),
                exc_type,
                exc_value,
                exc_traceback,
                exceptions_reporter_file,
                max_message_len=2024 - 500,
            )
        sys.exit(exit_code)
    else:
        return 0
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Forwarding a frame with multi-indexed columns must write one measurement
    per top-level name, stacked into 'sensor_name' / 'sensor_value' columns.
    """
    machine_config = {
        "name": "some-target-name",
        "dataset": {
            "tags": sensors_str,
            "target_tag_list": sensors_str,
            "train_start_date": "2016-01-01T00:00:00Z",
            "train_end_date": "2016-01-05T00:00:00Z",
            "resolution": "10T",
        },
        "model": "sklearn.linear_model.LinearRegression",
    }
    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        machine = Machine.from_config(config=machine_config, project_name="test-project")

    # As many features as there are tags: these get mapped to the tag names.
    input_keys = [("name1", position) for position, _ in enumerate(sensors)]

    # Twice as many features as tags: these keep 0..N as their field names.
    output_keys = [("name2", position) for position in range(len(sensors) * 2)]

    # Every key gets its own unique column of values.
    df = get_test_data(pd.MultiIndex.from_tuples(input_keys + output_keys))

    # Push the 'predictions' into influx through the forwarder.
    forwarder = ForwardPredictionsIntoInflux(destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Read the points back with a plain client to verify them.
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)
    stacked_columns = ["machine", "sensor_name", "sensor_value"]

    name1_results = client.query("SELECT * FROM name1")["name1"]
    for column in stacked_columns:
        assert column in name1_results.columns

    # Input values read from InfluxDB must match what was written, per tag.
    for position, tag in enumerate(sensors_str):
        rows_for_tag = name1_results[name1_results["sensor_name"] == tag]
        assert np.allclose(
            df[("name1", position)].values, rows_for_tag["sensor_value"].values
        )

    # The second top-level name is its own measurement with the same columns,
    # since every top level is stacked into the same resulting layout.
    name2_results = client.query("SELECT * FROM name2")["name2"]
    for column in stacked_columns:
        assert column in name2_results.columns

    # Output sensor names are stored in influx as string-cast integers.
    for key in output_keys:
        rows_for_key = name2_results[name2_results["sensor_name"] == str(key[1])]
        assert np.allclose(df[key].values, rows_for_key["sensor_value"].values)
def test_machine_from_config(default_globals: dict):
    """
    A Machine built from a YAML config element must expose the expected
    dictionary representation, with serializable metadata.
    """
    yaml_config = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE -23-0733.PV, GRA-TT -23-0719.PV, GRA-YE -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            steps:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    machine_config = get_dict_from_yaml(StringIO(yaml_config))
    machine = Machine.from_config(
        machine_config,
        project_name="test-project-name",
        config_globals=default_globals,
    )
    logger.info(f"{machine}")
    assert isinstance(machine, Machine)
    assert len(machine.dataset.tag_list) == 3

    # Metadata has to survive JSON serialization ...
    json.dumps(machine.to_dict()["metadata"])

    # ... and round-trip through str() / ast.literal_eval unchanged.
    metadata_repr = str(machine.to_dict()["metadata"])
    assert ast.literal_eval(metadata_repr) == machine.to_dict()["metadata"]

    # Expected dictionary representation, assembled section by section.
    expected_dataset = {
        "aggregation_methods": "mean",
        "asset": "global-asset",
        "data_provider": {
            "dl_service_auth_str": None,
            "interactive": False,
            "storename": "dataplatformdlsprod",
            "type": "DataLakeProvider",
        },
        "default_asset": None,
        "n_samples_threshold": 0,
        "resolution": "10T",
        "row_filter": "",
        "row_filter_buffer_size": 0,
        "tag_list": [
            "GRA-TE -23-0733.PV",
            "GRA-TT -23-0719.PV",
            "GRA-YE -23-0751X.PV",
        ],
        "target_tag_list": ["GRA-TE -123-456"],
        "train_end_date": "2018-01-02T09:00:30+00:00",
        "train_start_date": "2018-01-01T09:00:30+00:00",
        "type": "TimeSeriesDataset",
    }
    expected_evaluation = {
        "cv_mode": "full_build",
        "metrics": [
            "explained_variance_score",
            "r2_score",
            "mean_squared_error",
            "mean_absolute_error",
        ],
        "scoring_scaler": None,
    }
    expected_metadata = {
        "build_metadata": {
            "model": {
                "cross_validation": {
                    "cv_duration_sec": None,
                    "scores": {},
                    "splits": {},
                },
                "model_builder_version": __version__,
                "model_creation_date": None,
                "model_meta": {},
                "model_offset": 0,
                "model_training_duration_sec": None,
            },
            "dataset": {"query_duration_sec": None, "dataset_meta": {}},
        },
        "user_defined": {
            "global-metadata": {},
            "machine-metadata": {"id": "special-id"},
        },
    }
    expected_model = {
        "sklearn.pipeline.Pipeline": {
            "steps": [
                "sklearn.preprocessing.data.MinMaxScaler",
                {
                    "gordo.machine.model.models.KerasAutoEncoder": {
                        "kind": "feedforward_hourglass"
                    }
                },
            ]
        }
    }
    expected_runtime = {
        "reporters": [],
        "server": {
            "resources": {
                "limits": {"cpu": 4, "memory": 3},
                "requests": {"cpu": 2, "memory": 1},
            }
        },
    }
    expected = {
        "dataset": expected_dataset,
        "evaluation": expected_evaluation,
        "metadata": expected_metadata,
        "model": expected_model,
        "name": "ct-23-0001-machine",
        "project_name": "test-project-name",
        "runtime": expected_runtime,
    }
    assert machine.to_dict() == expected
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Forwarding a frame with multi-indexed columns must write one measurement
    per top-level name, with one influx column per second-level key.
    """
    machine_config = {
        "name": "some-target-name",
        "dataset": {
            "tags": sensors_str,
            "target_tag_list": sensors_str,
            "train_start_date": "2016-01-01T00:00:00Z",
            "train_end_date": "2016-01-05T00:00:00Z",
            "resolution": "10T",
        },
        "model": "sklearn.linear_model.LinearRegression",
    }
    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        machine = Machine.from_config(config=machine_config, project_name="test-project")

    # As many features as there are tags: these get mapped to the tag names.
    name1_keys = [("name1", position) for position, _ in enumerate(sensors)]
    # Twice as many features as tags: these keep 0..N as their field names.
    name2_keys = [("name2", position) for position in range(len(sensors) * 2)]
    keys = name1_keys + name2_keys

    df = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(keys),
        index=pd.date_range("2019-01-01", "2019-01-02", periods=4),
    )

    # Fill every column with its own distinct run of four values.
    for offset, key in enumerate(keys):
        df[key] = range(offset, offset + 4)

    # Push the 'predictions' into influx through the forwarder.
    forwarder = ForwardPredictionsIntoInflux(destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Read the points back with a plain client to verify them.
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    # Shapes matched, so the tag names are expected as column names.
    name1_results = client.query("SELECT * FROM name1")["name1"]
    for column in ["machine"] + sensors_str:
        assert column in name1_results.columns
    for position, tag in enumerate(sensors_str):
        assert np.allclose(df[("name1", position)].values, name1_results[tag].values)

    # Shape was twice the tag count, so plain numeric column names are expected.
    name2_results = client.query("SELECT * FROM name2")["name2"]
    for column in ["machine"] + list(range(len(sensors) * 2)):
        assert str(column) in name2_results.columns
    for key in (candidate for candidate in keys if candidate[0] == "name2"):
        assert np.allclose(df[key].values, name2_results[str(key[1])].values)
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]
        List of model key-values, wheres the values will be injected into the model
        config wherever there is a jinja variable with the key.
    """
    # A string model is a template; fill it in with the supplied key-values.
    if model_parameter and isinstance(machine_config["model"], str):
        machine_config["model"] = expand_model(
            machine_config["model"], dict(model_parameter)
        )

    machine: Machine = Machine.from_config(
        machine_config, project_name=machine_config["project_name"]
    )

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Register dir: {model_register_dir}")

    # Round-trip the definition through a real pipeline so that every default
    # parameter ends up spelled out in the stored config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    pipeline = serializer.from_definition(machine.model)
    machine.model = serializer.into_definition(pipeline)
    logger.info(f"Fully expanded model config: {machine.model}")

    builder = ModelBuilder(machine=machine)

    try:
        _, built_machine = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        built_machine.report()
        logger.debug("Finished reporting.")

        if print_cv_scores:
            for score_line in get_all_score_strings(built_machine):
                print(score_line)
    except Exception as error:
        # Known exception classes map to dedicated exit codes, anything else is 1.
        exit_code = EXCEPTION_TO_EXITCODE.get(error.__class__, 1)
        traceback.print_exc()
        sys.exit(exit_code)

    return 0