def _include_files(self, comp_root, comp_desc):
    include_patterns = self._parse_patterns(comp_desc.get(json_fields.COMPONENT_DESC_INCLUDE_GLOB_PATTERNS))
    exclude_patterns = self._parse_patterns(comp_desc.get(json_fields.COMPONENT_DESC_EXCLUDE_GLOB_PATTERNS))

    # Add "requirements.txt" if an include glob pattern is defined,
    # so the requirements file is always copied.
    if include_patterns:
        if os.path.exists(os.path.join(comp_root, MLCompConstants.REQUIREMENTS_FILENAME)):
            include_patterns.append(MLCompConstants.REQUIREMENTS_FILENAME)

    included_files = []
    init_py_found = False
    for root, _, files in os.walk(comp_root):
        for f in files:
            rltv_path = os.path.relpath(root, comp_root)
            filepath = os.path.join(rltv_path, f) if rltv_path != "." else f
            if self._path_included(filepath, include_patterns, exclude_patterns):
                if filepath == "__init__.py":
                    init_py_found = True

                # There can be several component JSONs in one folder.
                # Don't include any of them, not even the one related to the current
                # component; it will be included automatically.
                if ComponentsDesc._load_comp_desc(comp_root, f):
                    continue

                included_files.append(filepath)

    if comp_desc[json_fields.COMPONENT_DESC_LANGUAGE_FIELD] == ComponentLanguage.PYTHON and not init_py_found:
        comp_name = comp_desc[json_fields.COMPONENT_DESC_NAME_FIELD]
        raise Exception("Missing '__init__.py' in component's root folder or it is not included"
                        " by the 'glob' patterns! Please make sure to add it! name: {}, path: {}"
                        .format(comp_name, comp_root))

    return included_files
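# Illustrative sketch only: _parse_patterns() and _path_included() are used above but
# their implementations are not shown in this section. Assuming fnmatch-style glob
# semantics (an assumption, not confirmed here), the matching logic could look roughly
# like this hypothetical helper; the real implementation may differ.
def _example_path_included(filepath, include_patterns, exclude_patterns):
    import fnmatch

    # No include patterns means "include everything"; otherwise the relative
    # path has to match at least one include pattern.
    included = not include_patterns or any(
        fnmatch.fnmatch(filepath, pattern) for pattern in include_patterns)

    # Any matching exclude pattern removes the file, even if it was included.
    excluded = any(
        fnmatch.fnmatch(filepath, pattern) for pattern in (exclude_patterns or []))

    return included and not excluded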
def scan_dir(self, root_dir):
    """
    Scan a directory and return a map of the components found, keyed by engine type
    and component name:
    {
        "<engine_type>": {
            "<comp_name>": {
                "comp_desc": <parsed component JSON>,
                "root": <component root directory>,
                "files": [<files to copy, relative to root>],
                "comp_filename": <component JSON file name>
            }
        }
    }
    :return: the components map described above
    """
    comps = {}
    logging.debug("Scanning {}".format(root_dir))
    for root, comp_desc, comp_filename in ComponentsDesc.next_comp_desc(root_dir):
        engine_type = comp_desc[json_fields.COMPONENT_DESC_ENGINE_TYPE_FIELD]
        comps.setdefault(engine_type, {})
        comp_name = comp_desc[json_fields.COMPONENT_DESC_NAME_FIELD]
        if comp_name in comps[engine_type]:
            raise Exception("Component already defined!\n\tPrev comp file: {}\n\tCurr comp file: {}"
                            .format(os.path.join(comps[engine_type][comp_name]["root"],
                                                 comps[engine_type][comp_name]["comp_filename"]),
                                    os.path.join(root, comp_filename)))

        comps[engine_type][comp_name] = {}
        comps[engine_type][comp_name]["comp_desc"] = comp_desc
        comps[engine_type][comp_name]["root"] = root
        comps[engine_type][comp_name]["files"] = self._include_files(root, comp_desc)

        # Always include the current component's JSON file, regardless of its name.
        comps[engine_type][comp_name]["files"].append(comp_filename)
        comps[engine_type][comp_name]["comp_filename"] = comp_filename

        logging.debug("Found component, root: {}, engine: {}, name: {}".format(root, engine_type, comp_name))

    return comps
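# Illustrative sketch only: ComponentsDesc.next_comp_desc() is consumed above but not
# shown in this section. Judging by the loop, it walks root_dir and yields
# (root, comp_desc, comp_filename) tuples for every component JSON it can parse.
# A rough, hypothetical stand-alone equivalent (function name, file filtering and JSON
# handling are assumptions, not the project's actual API):
def _example_next_comp_desc(root_dir):
    import json
    import os

    for root, _, files in os.walk(root_dir):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            try:
                with open(os.path.join(root, filename)) as f:
                    comp_desc = json.load(f)
            except ValueError:
                # Not valid JSON; skip it.
                continue
            # A real implementation would also verify that the JSON actually
            # describes a component (e.g. contains the expected fields).
            yield root, comp_desc, filename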
def test_python_stand_alone_argument_building(self):
    systemConfig = {
        "statsDBHost": "localhost",
        "statsDBPort": 8899,
        "statsMeasurementID": "tf-job-0001",
        "mlObjectSocketHost": "localhost",
        "mlObjectSocketSourcePort": 9900,
        "mlObjectSocketSinkPort": 9901,
        "modelFileSinkPath": "output-model-1234",
        "modelFileSourcePath": "input-model-1234",
        "healthStatFilePath": "/tmp/health",
        "workflowInstanceId": "/tmp/run/filesink1",
        "socketSourcePort": 0,
        "socketSinkPort": 0,
        "enableHealth": True,
        "canaryThreshold": 0.0
    }

    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Test Train",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)
    dag_node = dag.get_dag_node(0)

    input_args = dag_node.input_arguments(systemConfig, comp_only_args=True)
    assert input_args["arg1"] == "arg1-value"
    assert input_args["output-model"] == "output-model-1234"
def test_dag_detect_is_stand_alone(self):
    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Hello",
                "id": 1,
                "type": "hello-world",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)
    assert dag.is_stand_alone is True
def test_correct_python_component_io(self):
    pipeline = {
        "name": "stand_alone_test",
        "engineType": "Generic",
        "pipe": [
            {
                "name": "Test Train 1",
                "id": 1,
                "type": "test-python-train",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 2",
                "id": 2,
                "type": "test-python-train",
                "parents": [
                    {"parent": 1, "output": 1, "input": 1},
                    {"parent": 1, "output": 0, "input": 0}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 3",
                "id": 3,
                "type": "test-python-train",
                "parents": [
                    {"parent": 2, "output": 0, "input": 0},
                    {"parent": 2, "output": 2, "input": 2},
                    {"parent": 2, "output": 1, "input": 1}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            },
            {
                "name": "Test Train 4",
                "id": 4,
                "type": "test-python-train",
                "parents": [
                    {"parent": 3, "output": 0, "input": 1},
                    {"parent": 3, "output": 1, "input": 0}
                ],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    python_engine = PythonEngine("test-pipe")
    comps_desc_list = ComponentsDesc(python_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, python_engine)

    dag_node_1 = dag.get_dag_node(0)
    dag_node_2 = dag.get_dag_node(1)
    dag_node_3 = dag.get_dag_node(2)
    dag_node_4 = dag.get_dag_node(3)

    # "A100" means: type A, node id 1, output index 0, goes to input index 0.
    #
    # Pipeline wiring (from the "parents" definitions above):
    #   node 1 -> node 2: output 0 -> input 0, output 1 -> input 1
    #   node 2 -> node 3: output 0 -> input 0, output 1 -> input 1, output 2 -> input 2
    #   node 3 -> node 4: output 0 -> input 1, output 1 -> input 0 (crossed)
    dag.update_parent_data_objs(dag_node_1, ["A100", "B111"])
    dag.update_parent_data_objs(dag_node_2, ["A200", "B211", "C222"])
    dag.update_parent_data_objs(dag_node_3, ["A301", "B310"])

    # Node 1 does not have any parents, so its parent data objects should be empty.
    assert dag.parent_data_objs(dag_node_1) == []

    # Node 2's parent entries are not listed in order in the JSON,
    # but its inputs should still be correctly indexed.
    assert dag.parent_data_objs(dag_node_2) == ["A100", "B111"]

    # Node 3's inputs are a bit more involved, but it is the same story as above.
    assert dag.parent_data_objs(dag_node_3) == ["A200", "B211", "C222"]

    # Node 4 gets node 3's output index 0 at its input index 1,
    # and node 3's output index 1 at its input index 0.
    assert dag.parent_data_objs(dag_node_4) == ["B310", "A301"]
def test_component_argument_building_with_sagemaker(self):
    systemConfig = {
        "statsDBHost": "localhost",
        "statsDBPort": 8899,
        "statsMeasurementID": "tf-job-0001",
        "mlObjectSocketHost": "localhost",
        "mlObjectSocketSourcePort": 9900,
        "mlObjectSocketSinkPort": 9901,
        "modelFileSinkPath": "output-model-1234",
        "modelFileSourcePath": "input-model-1234",
        "healthStatFilePath": "/tmp/health",
        "workflowInstanceId": "/tmp/run/filesink1",
        "socketSourcePort": 0,
        "socketSinkPort": 0,
        "enableHealth": True,
        "canaryThreshold": 0.0
    }

    region = "us-west-2"
    iam_role_value = "arn:aws:iam::ACCOUNT-ID-WITHOUT-HYPHENS:role/Get-pics"

    ee_config = {
        "configs": {
            "engConfig": {
                "type": "sagemaker",
                "arguments": {
                    "region": {
                        "value": region,
                        "type": "string",
                        "optional": "false",
                        "label": "Region",
                        "description": "The AWS Region to send the request to",
                        "editable": "true"
                    },
                    "aws_access_key_id": {
                        "value": "2134",
                        "type": "string",
                        "optional": "false",
                        "label": "Access Key ID",
                        "description": "A long term credential access key ID",
                        "editable": "true"
                    },
                    "aws_secret_access_key": {
                        "value": "123qwe",
                        "type": "string",
                        "optional": "false",
                        "label": "Secret Access Key",
                        "description": "A long term credential secret access key",
                        "editable": "true"
                    },
                    "iam_role": {
                        "value": iam_role_value,
                        "type": "string",
                        "optional": "false",
                        "label": "IAM Role",
                        "description": "The IAM role used for the request",
                        "editable": "true"
                    }
                }
            }
        }
    }

    pipeline = {
        "name": "SageMaker pipeline",
        "engineType": "SageMaker",
        "systemConfig": systemConfig,
        "executionEnvironment": ee_config,
        "pipe": [
            {
                "name": "String Source",
                "id": 1,
                "type": "string-source",
                "parents": [],
                "arguments": {
                    "arg1": "arg1-value"
                }
            }
        ]
    }

    sagemaker_engine = SageMakerEngine(pipeline)
    comps_desc_list = ComponentsDesc(sagemaker_engine, pipeline=pipeline).load()
    dag = Dag(pipeline, comps_desc_list, sagemaker_engine)
    dag_node = dag.get_dag_node(0)

    input_args = dag_node.input_arguments(systemConfig, ee_config, comp_only_args=False)
    assert input_args["arg1"] == "arg1-value"
    assert input_args["configs"]["engConfig"]["arguments"]["iam_role"]["value"] == iam_role_value