def test_store_big_run(self):
    """
    Regression test: a run with big (size-limited) artifact previews used to
    fail on read after insert once we moved to MySQL, raising
    _pickle.UnpicklingError: pickle data was truncated. The fix changed the
    BLOB fields to sqlalchemy.dialects.mysql.MEDIUMBLOB; this verifies that a
    big run can be stored and then read back successfully.
    """
    project_name = "runs-project"
    mlrun.new_project(project_name)

    uid = "some-uid"
    body_path = str(self.assets_path / "big-run.json")
    with open(body_path) as body_file:
        run_body = json.load(body_file)

    mlrun.get_run_db().store_run(run_body, uid, project_name)
    # Reading back is the part that used to fail with truncated pickle data.
    mlrun.get_run_db().read_run(uid, project_name)
def create_demo_project(self) -> mlrun.projects.MlrunProject:
    """Build the sklearn demo project: a local iris-generator job plus
    describe/train/test/serving functions from the hub and the main workflow."""
    self._logger.debug("Creating sklearn project")
    demo_project = mlrun.new_project(
        self.project_name, str(self.assets_path), init_git=True
    )

    self._logger.debug("Creating iris-generator function")
    gen_iris_fn = mlrun.code_to_function(
        name="gen-iris",
        kind="job",
        filename=str(self.assets_path / "iris_generator_function.py"),
        image="mlrun/mlrun",
    )
    gen_iris_fn.spec.remote = True
    gen_iris_fn.spec.replicas = 1
    gen_iris_fn.spec.service_type = "NodePort"
    gen_iris_fn.spec.build.commands.append("pip install pandas sklearn pyarrow")

    self._logger.debug("Setting project functions")
    demo_project.set_function(gen_iris_fn)
    for hub_url, function_name in (
        ("hub://describe", "describe"),
        ("hub://sklearn_classifier", "train"),
        ("hub://test_classifier", "test"),
        ("hub://model_server", "serving"),
        ("hub://model_server_tester", "live_tester"),
    ):
        demo_project.set_function(hub_url, function_name)

    self._logger.debug("Setting project workflow")
    demo_project.set_workflow(
        "main", str(self.assets_path / "workflow.py"), embed=True
    )

    return demo_project
def create_demo_project(self) -> mlrun.projects.MlrunProject:
    """Build the horovod demo project used by the system tests.

    Uploads the horovod training script to the user's v3io store, registers
    the utils job function and the mpijob trainer, adds a serving function
    from the hub, logs the sample images artifact, and sets the main workflow.
    """
    self._logger.debug("Creating horovod project")
    demo_project = mlrun.new_project(
        self.project_name, str(self.assets_path), init_git=True
    )
    mlrun.mount_v3io()

    self._logger.debug("Uploading training file")
    trainer_src_path = str(self.assets_path / "horovod_training.py")
    trainer_dest_path = pathlib.Path("/assets/horovod_training.py")
    stores = mlrun.datastore.store_manager.set()
    datastore, subpath = stores.get_or_create_store(
        self._get_v3io_user_store_path(trainer_dest_path)
    )
    datastore.upload(subpath, trainer_src_path)

    # Fixed log message: this section creates the utils function, not the
    # iris-generator function (the old message was a copy-paste leftover).
    self._logger.debug("Creating utils function")
    function_path = str(self.assets_path / "utils_functions.py")
    utils = mlrun.code_to_function(
        name="utils",
        kind="job",
        filename=function_path,
        image="mlrun/mlrun",
    )
    utils.spec.remote = True
    utils.spec.replicas = 1
    utils.spec.service_type = "NodePort"
    utils.spec.command = function_path

    self._logger.debug("Setting project functions")
    demo_project.set_function(utils)

    trainer = mlrun.new_function(
        name="trainer",
        kind="mpijob",
        command=self._get_v3io_user_store_path(trainer_dest_path, remote=False),
        image="mlrun/ml-models",
    )
    trainer.spec.remote = True
    trainer.spec.replicas = 4
    trainer.spec.service_type = "NodePort"

    demo_project.set_function(trainer)
    demo_project.set_function("hub://tf2_serving", "serving")

    demo_project.log_artifact(
        "images",
        target_path="http://iguazio-sample-data.s3.amazonaws.com/catsndogs.zip",
        artifact_path=mlrun.mlconf.artifact_path,
    )

    self._logger.debug("Setting project workflow")
    demo_project.set_workflow(
        "main", str(self.assets_path / "workflow.py"), embed=True
    )

    return demo_project
def test_create_project(self):
    """Saving a new project to the DB makes it appear in list_projects."""
    project_name = "some-project"
    mlrun.new_project(project_name).save_to_db()

    stored_projects = mlrun.get_run_db().list_projects()
    assert len(stored_projects) == 1
    assert stored_projects[0].metadata.name == project_name
def _create_project(self, project_name, with_repo=False):
    """Create a test project with a local prep function, hub functions,
    a data artifact, default params, and two workflows; save and return it."""
    proj = mlrun.new_project(project_name, str(self.assets_path))
    proj.set_function(
        "prep_data.py",
        "prep-data",
        image="mlrun/mlrun",
        handler="prep_data",
        with_repo=with_repo,
    )
    proj.set_function("hub://describe")
    for hub_url, function_name in (
        ("hub://sklearn_classifier", "train"),
        ("hub://test_classifier", "test"),
        ("hub://v2_model_server", "serving"),
    ):
        proj.set_function(hub_url, function_name)

    proj.set_artifact("data", Artifact(target_path=data_url))
    proj.spec.params = {"label_column": "label"}

    model_param = EntrypointParam(
        "model_pkg_class",
        type="str",
        default=model_pkg_class,
        doc="model package/algorithm",
    )
    proj.set_workflow("main", "./kflow.py", args_schema=[model_param])
    proj.set_workflow("newflow", "./newflow.py", handler="newpipe")

    proj.save()
    return proj
def test_get_set_params():
    """get_param returns stored parameter values and falls back to defaults."""
    project = mlrun.new_project("project-name")

    key, value = "param-key", "param-value"
    project.params[key] = value
    assert project.get_param(key) == value

    fallback = "default-value"
    assert project.get_param("not-exist", fallback) == fallback
def test_vault_end_to_end():
    """End-to-end Vault secrets flow: create project secrets, run a job that
    reads them, and verify the values appear in the run log.

    Requires an MLRun API server configured to work with Vault; the port
    below must allow access to that server.
    """
    api_server_port = 57764
    _set_vault_mlrun_configuration(api_server_port)

    project_name = "abc"
    func_name = "vault-function"
    aws_key_value = "1234567890"
    github_key_value = "proj1Key!!!"

    project = new_project(project_name)
    # Initializes Vault infrastructure and stores the secrets — executes on
    # the API server.
    project.create_vault_secrets(
        {"aws_key": aws_key_value, "github_key": github_key_value}
    )
    # This API executes on the client side.
    assert project.get_vault_secret_keys() == [
        "aws_key",
        "github_key",
    ], "secrets not created"

    # Create the function and set its container configuration.
    function = code_to_function(
        name=func_name,
        filename=f"{examples_path}/vault_function.py",
        handler="vault_func",
        project=project_name,
        kind="job",
    )
    function.spec.image = "saarcoiguazio/mlrun:unstable"

    # Create the execution context.
    task = new_task(
        project=project_name,
        name="vault_test_run",
        handler="vault_func",
        out_path=out_path,
        params={"secrets": ["password", "path", "github_key", "aws_key"]},
    )
    task.with_secrets("vault", [])

    result = function.run(task)
    verify_state(result)

    db = get_run_db().connect()
    state, log = db.get_log(result.metadata.uid, project=project_name)
    log = str(log)
    print(state)

    for secret_value in (aws_key_value, github_key_value):
        assert (
            f"value: {secret_value}" in log
        ), "secret value not detected in function output"
def test_zip_template(self):
    """Creating a project from a zip template loads metadata and copies files."""
    shutil.rmtree(project_dir, ignore_errors=True)

    project = mlrun.new_project(
        "newproj2", project_dir, from_template=str(self.assets_path / "project.zip")
    )
    assert project.spec.description == "test", "failed to load yaml template"

    copied_file = os.path.join(project_dir, "prep_data.py")
    assert os.path.isfile(copied_file), "file not copied"
def test_git_template(self):
    """Creating a project from a git template loads metadata and copies files."""
    shutil.rmtree(project_dir, ignore_errors=True)

    project = mlrun.new_project(
        "newproj3",
        project_dir,
        from_template="git://github.com/mlrun/project-demo.git",
    )
    assert project.spec.description == "test", "failed to load yaml template"

    copied_file = os.path.join(project_dir, "prep_data.py")
    assert os.path.isfile(copied_file), "file not copied"
def test_sync_functions():
    """An exported project restores its function objects via sync_functions."""
    project = mlrun.new_project("project-name")
    project.set_function("hub://describe")
    original_function_objects = project.spec._function_objects

    export_path = pathlib.Path(tests.conftest.results) / "project.yaml"
    project.export(str(export_path))

    imported_project = mlrun.load_project(None, str(export_path))
    # Freshly loaded projects start with no instantiated function objects.
    assert imported_project.spec._function_objects == {}

    imported_project.sync_functions()
    _assert_project_function_objects(imported_project, original_function_objects)
def _create_project(self, project_name):
    """Create a local test project with the localpipe function and a data artifact."""
    proj = mlrun.new_project(project_name, f"{project_dir}/{project_name}")
    proj.set_function(
        str(self.assets_path / "localpipe.py"),
        "tstfunc",
        image="mlrun/mlrun",
        # kind="job"
    )
    proj.set_artifact("data", mlrun.artifacts.Artifact(target_path=data_url))
    proj.spec.params = {"label_column": "label"}
    return proj
def test_user_project():
    """user_project=True suffixes the project name with the normalized username.

    Fix: the original version only restored V3IO_USERNAME when it was
    previously set — if it was unset before the test, the last loop value
    leaked into the environment, polluting subsequent tests. It also skipped
    restoration entirely when an assertion failed. Wrapping the loop in
    try/finally and popping the variable when it was originally absent makes
    the cleanup unconditional.
    """
    project_name = "project-name"
    original_username = os.environ.get("V3IO_USERNAME")
    usernames = ["valid-username", "require_Normalization"]
    try:
        for username in usernames:
            os.environ["V3IO_USERNAME"] = username
            project = mlrun.new_project(project_name, user_project=True)
            # Usernames are lower-cased and dasherized before being appended.
            assert (
                project.metadata.name
                == f"{project_name}-{inflection.dasherize(username.lower())}"
            ), "project name doesnt include user name"
    finally:
        if original_username is not None:
            os.environ["V3IO_USERNAME"] = original_username
        else:
            # The variable was not set before the test — remove it again.
            os.environ.pop("V3IO_USERNAME", None)
def create_demo_project(self) -> mlrun.projects.MlrunProject:
    """Build the churn demo project: raw-data artifact, a local clean-data
    job, several hub functions, and the main workflow."""
    self._logger.debug("Creating churn project")
    demo_project = mlrun.new_project(
        self.project_name, str(self.assets_path), init_git=True
    )

    raw_data_url = (
        "https://raw.githubusercontent.com/mlrun/demos/master/customer-churn-prediction/WA_Fn-UseC_-Telco-"
        "Customer-Churn.csv"
    )
    demo_project.log_artifact("raw-data", target_path=raw_data_url)

    self._logger.debug("Creating clean-data function")
    function_path = str(self.assets_path / "data_clean_function.py")
    clean_data_fn = mlrun.code_to_function(
        name="clean_data",
        kind="job",
        filename=function_path,
        image="mlrun/ml-models-gpu" if self.use_gpus else "mlrun/ml-models",
        description="clean and encode raw data",
        categories=["data-prep"],
        labels={"author": "yasha", "framework": "xgboost"},
    ).apply(mlrun.mount_v3io())
    clean_data_fn.spec.remote = True
    clean_data_fn.spec.replicas = 1
    clean_data_fn.spec.service_type = "NodePort"
    clean_data_fn.spec.command = function_path

    self._logger.debug("Setting project functions")
    demo_project.set_function(clean_data_fn)
    for hub_url, function_name in (
        ("hub://describe", "describe"),
        ("hub://xgb_trainer", "classify"),
        ("hub://xgb_test", "xgbtest"),
        ("hub://coxph_trainer", "survive"),
        ("hub://coxph_test", "coxtest"),
        ("hub://churn_server", "server"),
    ):
        demo_project.set_function(hub_url, function_name)

    self._logger.debug("Setting project workflow")
    demo_project.set_workflow(
        "main", str(self.assets_path / "workflow.py"), embed=True
    )

    return demo_project
def test_sync_functions():
    """Synced function objects match the exported project, and functions are
    fetchable via project.func — including ones saved directly to the DB."""
    project = mlrun.new_project("project-name")
    project.set_function("hub://describe", "describe")
    original_function_objects = project.spec._function_objects

    export_path = pathlib.Path(tests.conftest.results) / "project.yaml"
    project.export(str(export_path))

    imported_project = mlrun.load_project("./", str(export_path))
    # Freshly loaded projects start with no instantiated function objects.
    assert imported_project.spec._function_objects == {}
    imported_project.sync_functions()
    _assert_project_function_objects(imported_project, original_function_objects)

    describe_fn = project.func("describe")
    assert describe_fn.metadata.name == "describe", "func did not return"

    # Functions can also be fetched from the DB without a set_function call.
    mlrun.import_function("hub://sklearn_classifier", new_name="train").save()
    train_fn = project.func("train")
    assert train_fn.metadata.name == "train", "train func did not return"
def test_list_artifact_tags(self):
    """Artifacts stored under different tags make both tags listable."""
    project_name = "some-project"
    mlrun.new_project(project_name).save_to_db()

    # No artifacts stored yet — the tag list starts empty.
    assert mlrun.get_run_db().list_artifact_tags(project_name) == []

    key = "some-key"
    artifact = mlrun.artifacts.dataset.DatasetArtifact(
        key, pandas.DataFrame({"x": [1, 2]})
    )
    for uid, tag in (("some-uid", "some-tag"), ("some-uid-2", "some-tag-2")):
        mlrun.get_run_db().store_artifact(
            key, artifact.to_dict(), uid, tag=tag, project=project_name
        )

    artifact_tags = mlrun.get_run_db().list_artifact_tags(project_name)
    assert (
        deepdiff.DeepDiff(artifact_tags, ["some-tag", "some-tag-2"], ignore_order=True)
        == {}
    )
def test_model_update():
    """update_model applies parameter/metric/feature changes that get_model
    then sees on the stored artifact."""
    base_model = ModelArtifact("my-model", model_file="a.pkl")
    target_path = results_dir + "model/"
    project = mlrun.new_project("test-proj")
    artifact = project.log_artifact(base_model, upload=False, artifact_path=target_path)
    artifact_uri = f"store://artifacts/{artifact.project}/{artifact.db_key}"

    updated_spec = update_model(
        artifact_uri,
        parameters={"a": 1},
        metrics={"b": 2},
        inputs=[Feature(name="f1")],
        outputs=[Feature(name="f2")],
        feature_vector="vec",
        feature_weights=[1, 2],
        key_prefix="test-",
        labels={"lbl": "tst"},
        write_spec_copy=False,
    )
    print(updated_spec.to_yaml())

    model_path, loaded_model, extra_dataitems = get_model(artifact_uri)
    assert model_path.endswith(f"model/{loaded_model.model_file}"), "illegal model path"
    assert loaded_model.parameters == {"a": 1}, "wrong parameters"
    # Metrics pick up the key_prefix passed to update_model.
    assert loaded_model.metrics == {"test-b": 2}, "wrong metrics"
    assert loaded_model.inputs[0].name == "f1", "wrong inputs"
    assert loaded_model.outputs[0].name == "f2", "wrong outputs"
    assert loaded_model.feature_vector == "vec", "wrong feature_vector"
    assert loaded_model.feature_weights == [1, 2], "wrong feature_weights"
    assert loaded_model.labels == {"lbl": "tst"}, "wrong labels"
def test_user_project():
    """user_project=True appends the current user's name to the project name."""
    project_name = "project-name"
    # V3IO_USERNAME wins when set; otherwise fall back to the OS user.
    expected_user = os.environ.get("V3IO_USERNAME") or getpass.getuser()
    project = mlrun.new_project(project_name, user_project=True)
    assert (
        project.metadata.name == f"{project_name}-{expected_user}"
    ), "project name doesnt include user name"
def test_yaml_template(self):
    """Creating a project from a yaml template picks up the template metadata."""
    template_path = str(self.assets_path / "project.yaml")
    project = mlrun.new_project("newproj", "./", from_template=template_path)
    assert project.spec.description == "test", "failed to load yaml template"
def test_load_project_from_db(self):
    """A project saved to the DB can be loaded back through a db:// URL."""
    project_name = "some-project"
    mlrun.new_project(project_name).save_to_db()
    mlrun.load_project(".", f"db://{project_name}")
import_function, mlconf, mount_v3io, new_function, new_project, run_local, wait_for_pipeline_completion, ) # Load environment variables load_dotenv() # Setup Project project_name = os.getenv("MLRUN_PROJECT_NAME") project_path = os.getenv("MLRUN_PROJECT_PATH") skproj = new_project(name=project_name, context=project_path) artifact_path = os.getenv("MLRUN_ARTIFACT_PATH") mlconf.dbpath = os.getenv("MLRUN_DBPATH") image = os.getenv("DOCKER_IMAGE") print(f"Project name: {project_name}") print(f"Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}") print("Docker Image:", image) # # Build Docker Image (only needs to be run once) # build_image = new_function(name="build-image", kind="job") # build_image.build_config( # image=f".mlrun/{image}", base_image="mlrun/mlrun", commands=["pip install pyhive"] # ) # build_image.deploy(with_mlrun=False)