def test_basic_generation(path_to_config_files):
    """
    The expanded workflow must embed both the project name and the model
    config verbatim from the source config file, and the normalized config
    must yield the expected number of machines.

    start/end dates ...always included? or default to specific dates if not included?
    """
    project_name = "some-fancy-project-name"
    # Split literal — concatenated value is byte-identical to the config's model entry.
    model_config = (
        '{"sklearn.pipeline.Pipeline": {"steps": ["sklearn.preprocessing.data.MinMaxScaler", '
        '{"gordo.machine.model.models.KerasAutoEncoder": {"kind": "feedforward_hourglass"}}]}}'
    )
    config_filename = "config-test-with-models.yml"

    expanded_template = _generate_test_workflow_str(
        path_to_config_files, config_filename, project_name=project_name
    )

    assert (
        project_name in expanded_template
    ), f"Expected to find project name: {project_name} in output: {expanded_template}"
    assert (
        model_config in expanded_template
    ), f"Expected to find model config: {model_config} in output: {expanded_template}"

    config_path = os.path.join(path_to_config_files, config_filename)
    yaml_content = wg.get_dict_from_yaml(config_path)

    # Asset lookup is patched out so tag normalization never hits real metadata.
    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        normed = NormalizedConfig(yaml_content, project_name=project_name)
    assert len(normed.machines) == 2
def local_build(
    config_str: str,
) -> Iterable[Tuple[Union[BaseEstimator, None], Machine]]:
    """
    Build model(s) locally from a bare Gordo config given as a YAML string.

    Runs the same steps the normal workflow generation and subsequent Gordo
    deployment would: parse the YAML, normalize it, and build each machine.
    Useful for local development and as a sanity check that a config is
    valid for deployment with Gordo.

    Parameters
    ----------
    config_str: str
        The raw yaml config file in string format.

    Examples
    --------
    >>> import numpy as np
    >>> from gordo.dependencies import configure_once
    >>> configure_once()
    >>> config = '''
    ... machines:
    ...       - dataset:
    ...           tags:
    ...             - SOME-TAG1
    ...             - SOME-TAG2
    ...           target_tag_list:
    ...             - SOME-TAG3
    ...             - SOME-TAG4
    ...           train_end_date: '2019-03-01T00:00:00+00:00'
    ...           train_start_date: '2019-01-01T00:00:00+00:00'
    ...           asset: asgb
    ...           data_provider:
    ...             type: RandomDataProvider
    ...         metadata:
    ...           information: Some sweet information about the model
    ...         model:
    ...           gordo.machine.model.anomaly.diff.DiffBasedAnomalyDetector:
    ...             base_estimator:
    ...               sklearn.pipeline.Pipeline:
    ...                 steps:
    ...                   - sklearn.decomposition.PCA
    ...                   - sklearn.multioutput.MultiOutputRegressor:
    ...                       estimator: sklearn.linear_model.LinearRegression
    ...         name: crazy-sweet-name
    ... '''
    >>> models_n_metadata = local_build(config)
    >>> assert len(list(models_n_metadata)) == 1

    Returns
    -------
    Iterable[Tuple[Union[BaseEstimator, None], Machine]]
        A generator yielding tuples of models and their metadata.
    """
    raw_config = get_dict_from_yaml(io.StringIO(config_str))
    normalized = NormalizedConfig(raw_config, project_name="local-build")
    # Lazily build one model per machine, in config order.
    yield from (
        ModelBuilder(machine=machine).build() for machine in normalized.machines
    )
def test_splited_docker_images():
    """
    An empty config normalized by NormalizedConfig must receive the default,
    per-component docker image names in its runtime globals.
    """
    bare_config = {"machines": [], "globals": {"runtime": {}}}
    runtime = NormalizedConfig(bare_config, "test", "1.0.0").globals["runtime"]

    # Expected default image per runtime component.
    expected_images = {
        "deployer": "gordo-deploy",
        "server": "gordo-model-server",
        "prometheus_metrics_server": "gordo-model-server",
        "builder": "gordo-model-builder",
        "client": "gordo-client",
    }
    for component, image in expected_images.items():
        assert runtime[component]["image"] == image
def unique_tag_list_cli(machine_config: str, output_file_tag_list: str):
    """
    Emit the unique set of dataset tag names found in a machine config.

    Writes one tag name per line to ``output_file_tag_list`` when given,
    otherwise prints them to stdout.
    """
    config = NormalizedConfig(
        wg.get_dict_from_yaml(machine_config), project_name="test-proj-name"
    )
    # De-duplicate tags across all machines.
    unique_tags = {
        tag for machine in config.machines for tag in machine.dataset.tag_list
    }

    if not output_file_tag_list:
        for tag in unique_tags:
            print(tag.name)
    else:
        with open(output_file_tag_list, "w") as fh:
            fh.writelines(f"{tag.name}\n" for tag in unique_tags)
def workflow_generator_cli(gordo_ctx, **ctx):
    """
    Machine Configuration to Argo Workflow

    Reads the machine config YAML, normalizes it, assembles a template
    rendering context from the config's runtime globals and CLI options,
    then renders the Argo workflow template — either to ``output_file``
    or to stdout — in chunks of ``split_workflows`` machines separated
    by YAML document markers (``---``).
    """
    context: Dict[Any, Any] = ctx.copy()
    yaml_content = wg.get_dict_from_yaml(context["machine_config"])

    # Log level precedence: config globals -> env var -> CLI/group default.
    try:
        log_level = yaml_content["globals"]["runtime"]["log_level"]
    except KeyError:
        log_level = os.getenv("GORDO_LOG_LEVEL", gordo_ctx.obj["log_level"])

    logging.getLogger("gordo").setLevel(log_level.upper())
    context["log_level"] = log_level.upper()

    # Create normalized config
    config = NormalizedConfig(yaml_content, project_name=context["project_name"])

    # If n_servers wasn't given (falsy), scale server replicas with machine count.
    context["max_server_replicas"] = (
        context.pop("n_servers") or len(config.machines) * 10
    )

    # We know these exist since we set them in the default globals
    builder_resources = config.globals["runtime"]["builder"]["resources"]
    context["model_builder_resources_requests_memory"] = builder_resources["requests"]["memory"]
    context["model_builder_resources_requests_cpu"] = builder_resources["requests"]["cpu"]
    context["model_builder_resources_limits_memory"] = builder_resources["limits"]["memory"]
    context["model_builder_resources_limits_cpu"] = builder_resources["limits"]["cpu"]

    context["server_resources"] = config.globals["runtime"]["server"]["resources"]

    # These are also set in the default globals, and guaranteed to exist
    client_resources = config.globals["runtime"]["client"]["resources"]
    context["client_resources_requests_memory"] = client_resources["requests"]["memory"]
    context["client_resources_requests_cpu"] = client_resources["requests"]["cpu"]
    context["client_resources_limits_memory"] = client_resources["limits"]["memory"]
    context["client_resources_limits_cpu"] = client_resources["limits"]["cpu"]
    context["client_max_instances"] = config.globals["runtime"]["client"]["max_instances"]

    influx_resources = config.globals["runtime"]["influx"]["resources"]
    context["influx_resources_requests_memory"] = influx_resources["requests"]["memory"]
    context["influx_resources_requests_cpu"] = influx_resources["requests"]["cpu"]
    context["influx_resources_limits_memory"] = influx_resources["limits"]["memory"]
    context["influx_resources_limits_cpu"] = influx_resources["limits"]["cpu"]

    # Influx is opt-out per machine: enabled unless runtime.influx.enable is False.
    nr_of_models_with_clients = len([
        machine
        for machine in config.machines
        if machine.runtime.get("influx", {}).get("enable", True)
    ])
    context["client_total_instances"] = nr_of_models_with_clients

    # Should we start up influx/grafana at all, i.e. is there at least one request
    # for it?"
    enable_influx = nr_of_models_with_clients > 0
    context["enable_influx"] = enable_influx

    context["postgres_host"] = f"gordo-postgres-{config.project_name}"

    # If enabling influx, we setup a postgres reporter to send metadata
    # to allowing querying about the machine from grafana
    if enable_influx:
        pg_reporter = {
            "gordo.reporters.postgres.PostgresReporter": {
                "host": context["postgres_host"]
            }
        }
        for machine in config.machines:
            # NOTE(review): assumes runtime["reporters"] list exists on every
            # machine — presumably set by the default globals; confirm.
            machine.runtime["reporters"].append(pg_reporter)

    # Determine if MlFlowReporter should be enabled per machine
    for machine in config.machines:
        try:
            enabled = machine.runtime["builder"]["remote_logging"]["enable"]
        except KeyError:
            # No remote_logging section configured for this machine — skip.
            continue
        else:
            if enabled:
                machine.runtime["reporters"].append(
                    "gordo.reporters.mlflow.MlFlowReporter")

    context["machines"] = config.machines

    # Context requiring pre-processing
    context["target_names"] = [machine.name for machine in config.machines]

    # Json dump owner_references, if not None, otherwise pop it out of the context
    if context["owner_references"]:
        context["owner_references"] = json.dumps(context["owner_references"])
    else:
        context.pop("owner_references")

    builder_exceptions_report_level = get_builder_exceptions_report_level(config)
    context["builder_exceptions_report_level"] = builder_exceptions_report_level.name
    if builder_exceptions_report_level != ReportLevel.EXIT_CODE:
        context["builder_exceptions_report_file"] = "/tmp/exception.json"

    # Custom template path wins; otherwise use the packaged default template.
    if context["workflow_template"]:
        template = wg.load_workflow_template(context["workflow_template"])
    else:
        workflow_template = pkg_resources.resource_filename(
            "gordo.workflow.workflow_generator.resources",
            "argo-workflow.yml.template")
        template = wg.load_workflow_template(workflow_template)

    # Clear output file
    if context["output_file"]:
        open(context["output_file"], "w").close()  # type: ignore

    # Render the workflow in batches of split_workflows machines; each batch
    # becomes its own YAML document, separated by "---".
    for i in range(0, len(config.machines), context["split_workflows"]):  # type: ignore
        logger.info(
            f"Generating workflow for machines {i} to {i + context['split_workflows']}"
        )
        # Overrides the full machine list set above with just this batch.
        context["machines"] = config.machines[i:i + context["split_workflows"]]

        if context["output_file"]:
            s = template.stream(**context)
            with open(context["output_file"], "a") as f:  # type: ignore
                if i != 0:
                    f.write("\n---\n")
                s.dump(f)
        else:
            output = template.render(**context)
            if i != 0:
                print("\n---\n")
            print(output)