def test_package(self):
    """Check the result contract around the sample package execution.

    An ``MLObject`` that only has its schema type/version set is expected to
    be rejected by ``verify_result_contract`` with a ``ValueError``. After
    ``tests/sample_package_execution.py`` has been executed and the run
    metadata and execution profile have been filled in, the same object is
    expected to verify successfully.
    """
    # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
    # NOTE(review): this is the package test but the step name is "train" --
    # confirm this is intentional.
    step_name = "train"
    expected_results_schema_type = "package_results"  # MUST BE A LOADED SCHEMA
    expected_results_schema_version = "9999.0.1"  # MUST BE A SEMVER

    # Not referenced directly below, but every local is handed to the sample
    # file through ``locals()``, so it is kept in scope.
    step_execution_object = StepExecution(  # noqa: F841
        self.input_parameters, self.execution_parameters
    )

    results_ml_object = MLObject()
    results_ml_object.set_type(
        schema_type=expected_results_schema_type,
        schema_version=expected_results_schema_version,
    )

    # Should error due to missing fields
    with self.assertRaises(ValueError) as context:
        verify_result_contract(
            results_ml_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )

    self.assertTrue(
        f"Error verifying result object for '{step_name}.output'"
        in str(context.exception)
    )

    # The below are used in the execution file
    result_ml_object_schema_type = expected_results_schema_type  # noqa: F841
    result_ml_object_schema_version = expected_results_schema_version  # noqa: F841

    exec(
        (Path("tests") / "sample_package_execution.py").read_text(),
        globals(),
        locals(),
    )

    results_ml_object.run_date = datetime.datetime.now()
    # Stringified to match how execute_step and the sibling tests set ids.
    results_ml_object.step_id = str(uuid.uuid4())
    results_ml_object.run_id = str(uuid.uuid4())

    profile = results_ml_object.execution_profile
    profile.system_memory_utilization = random()
    # randint() needs real integers: float bounds raise TypeError on 3.12+.
    profile.network_traffic_in_bytes = randint(int(7e9), int(9e10))
    profile.gpu_temperature = randint(70, 130)
    profile.disk_io_utilization = random()
    profile.gpu_percent_of_time_accessing_memory = random()
    profile.cpu_utilization = random()
    profile.gpu_utilization = random()
    profile.gpu_memory_allocation = random()

    self.assertTrue(
        verify_result_contract(
            results_ml_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )
    )
def test_e2e(self):
    """Run the process_data step and check its output against the contract.

    First confirms that an object carrying only a schema type/version fails
    ``verify_result_contract`` with the expected ``ValueError`` message, then
    executes the step and confirms the populated result verifies.
    """
    MLSchema.populate_registry()
    schema_dir = Path.cwd() / ".parameters" / "schemas"
    MLSchema.append_schema_to_registry(schema_dir)

    # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
    step_name = "process_data"
    expected_results_schema_type = "data_result"  # MUST BE A LOADED SCHEMA
    expected_results_schema_version = "0.0.1"  # MUST BE A SEMVER

    # Execute step
    # Put sample required input parameters here
    input_parameters = {}
    # Put sample required execution parameters here
    execution_parameters = {}
    step = StepExecution(input_parameters, execution_parameters)

    # Positional tail shared by both contract checks below.
    contract_args = (
        expected_results_schema_type,
        expected_results_schema_version,
        step_name,
    )

    bare_result = MLObject()
    bare_result.set_type(
        schema_type=expected_results_schema_type,
        schema_version=expected_results_schema_version,
    )

    # Should error due to missing fields
    with self.assertRaises(ValueError) as raised:
        verify_result_contract(bare_result, *contract_args)

    expected_fragment = f"Error verifying result object for '{step_name}.output'"
    self.assertTrue(expected_fragment in str(raised.exception))

    populated_result = step.execute(
        result_object_schema_type=expected_results_schema_type,
        result_object_schema_version=expected_results_schema_version,
    )
    populated_result.run_date = datetime.datetime.now()
    populated_result.step_id = uuid.uuid4()
    populated_result.run_id = uuid.uuid4()

    verified = verify_result_contract(populated_result, *contract_args)
    self.assertTrue(verified)
def test_process_data(self):
    """Full E2E of Process Data.

    An ``MLObject`` with only its schema type/version set is expected to be
    rejected by ``verify_result_contract``. A fresh object is then handed to
    ``tests/sample_process_data_execution.py`` and, once run metadata and the
    execution profile are filled in, is expected to verify successfully.
    """
    # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
    step_name = "process_data"
    expected_results_schema_type = "data_result"  # MUST BE A LOADED SCHEMA
    expected_results_schema_version = "9999.0.1"  # MUST BE A SEMVER

    results_ml_object = MLObject()
    results_ml_object.set_type(
        schema_type=expected_results_schema_type,
        schema_version=expected_results_schema_version,
    )

    # Should error due to missing fields
    with self.assertRaises(ValueError) as context:
        verify_result_contract(
            results_ml_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )

    self.assertTrue(
        f"Error verifying result object for '{step_name}.output'"
        in str(context.exception)
    )

    # Start over with an untyped object for the sample file to work on.
    results_ml_object = MLObject()

    # The below are used in the execution file
    result_ml_object_schema_type = expected_results_schema_type  # noqa: F841
    result_ml_object_schema_version = expected_results_schema_version  # noqa: F841

    exec(
        (Path("tests") / "sample_process_data_execution.py").read_text(),
        globals(),
        locals(),
    )

    results_ml_object.run_date = datetime.datetime.now()
    results_ml_object.step_id = str(uuid.uuid4())
    results_ml_object.run_id = str(uuid.uuid4())

    profile = results_ml_object.execution_profile
    profile.system_memory_utilization = random()
    # randint() needs real integers: float bounds raise TypeError on 3.12+.
    profile.network_traffic_in_bytes = randint(int(7e9), int(9e10))
    profile.gpu_temperature = randint(70, 130)
    profile.disk_io_utilization = random()
    profile.gpu_percent_of_time_accessing_memory = random()
    profile.cpu_utilization = random()
    profile.gpu_utilization = random()
    profile.gpu_memory_allocation = random()

    self.assertTrue(
        verify_result_contract(
            results_ml_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )
    )
def sub_main():
    """Run one workflow step end to end and publish its outputs.

    In order, this function:

    1. Loads parameters from the environment and, when
       ``INPUT_SCHEMAS_GIT_URL`` is set to a non-empty value, clones that
       repository into a new sub-directory of the schemas directory.
    2. Appends the schemas directory to the ``MLSchema`` registry.
    3. Decodes the metastore credentials and connects to the metastore.
    4. Loads the workflow object plus the input and execution parameters.
    5. Builds the input and execution contract objects and attaches both to
       the metastore.
    6. Calls ``execute_step`` and attaches its result and a log object.
    7. Prints the outputs and writes them to ``/output_message.txt`` when
       running in docker (otherwise to a throw-away temporary file).

    Raises:
        KnownException: if the schema repository cannot be cloned, if
            ``INPUT_METASTORE_CREDENTIALS`` is missing or empty, or if
            ``INPUT_WORKFLOW_NODE_ID`` is missing or empty.
    """
    rootLogger = setupLogger().get_root_logger()
    ok = "{:>15}".format("ok")

    # Loading input values
    msg = "::debug::Loading input values"
    print_left_message("Loading variables from environment...")
    rootLogger.debug(msg)
    parameters = convert_environment_variables_to_dict()
    print(ok)  # Finished loading from environment

    parameters.INPUT_SCHEMAS_DIRECTORY = os.environ.get("INPUT_SCHEMAS_DIRECTORY")

    # Only clone when a URL was actually supplied; an unset or empty value
    # means "use the schemas directory as-is".
    schemas_git_url = os.environ.get("INPUT_SCHEMAS_GIT_URL")
    if schemas_git_url:
        parameters.INPUT_SCHEMAS_GIT_URL = schemas_git_url
        print_left_message(
            f"Downloading schemas from {parameters.INPUT_SCHEMAS_GIT_URL}..."
        )
        try:
            # The random directory name keeps repeated clones from colliding.
            git.Git(parameters.INPUT_SCHEMAS_DIRECTORY).clone(
                parameters.INPUT_SCHEMAS_GIT_URL, str(uuid.uuid4()), depth=1
            )
            # TODO: Authenticate with GH Token?
            print(ok)  # Finished loading from GIT URL
        except GitCommandError as gce:
            raise KnownException(
                f"Trying to read from the git repo ({parameters.INPUT_SCHEMAS_GIT_URL}) and write to the directory ({parameters.INPUT_SCHEMAS_DIRECTORY}). Full error follows: {str(gce)}"
            ) from gce

    print_left_message("Appending schemas to registry...")
    MLSchema.append_schema_to_registry(Path(parameters.INPUT_SCHEMAS_DIRECTORY))
    print(ok)  # Finished loading registry

    parameters.previous_step_name = os.environ.get("INPUT_PREVIOUS_STEP_NAME", "")
    parameters.next_step_name = os.environ.get("INPUT_NEXT_STEP_NAME", "")
    rootLogger.debug("::debug:: Finished main")

    # Load metastore credentials
    rootLogger.debug("::debug:: Loading credentials")
    print_left_message("Loading and validating metastore credentials...")
    metastore_cred_string_blob = os.environ.get("INPUT_METASTORE_CREDENTIALS")
    if not metastore_cred_string_blob:
        # Fail with a readable message instead of an error from the YAML loader.
        raise KnownException(
            "INPUT_METASTORE_CREDENTIALS - No metastore credentials were provided."
        )
    metastore_credentials_packed = YAML.safe_load(metastore_cred_string_blob)
    metastore_credentials_string = base64.urlsafe_b64decode(
        metastore_credentials_packed
    ).decode("utf-8")
    metastore_credentials = YAML.safe_load(metastore_credentials_string)

    report_found_params(
        ["url", "key", "database_name", "container_name"], metastore_credentials
    )
    print(ok)  # Finished loading and validating metastore

    rootLogger.debug("::debug::Starting metastore connection")
    print_left_message("Starting connection to metastore...")
    ms = load_metastore_connection(metastore_credentials_packed)
    print(ok)  # Finished connecting to metastore

    workflow_node_id = os.environ.get("INPUT_WORKFLOW_NODE_ID")
    if not workflow_node_id:  # covers both unset (None) and empty string
        raise KnownException(
            "INPUT_WORKFLOW_NODE_ID - No workflow node id was provided."
        )

    print_left_message(f"Loading workflow object ID: '{workflow_node_id}' ...")
    workflow_object = load_workflow_object(workflow_node_id, ms)
    print(ok)  # Finished loading workflow object

    rootLogger.debug("::debug::Loading input parameters")
    print_left_message("Loading input parameters ...")
    input_parameters = load_parameters("INPUT", ms)
    print(ok)  # Finished loading input parameters from metastore

    rootLogger.debug("::debug::Loading execution parameters file")
    print_left_message("Loading execution parameters ...")
    execution_parameters = load_parameters("EXECUTION", ms)
    print(ok)  # Finished loading execution parameters from metastore

    step_name = parameters.INPUT_STEP_NAME

    print_left_message(f"Loading contract for '{step_name}.input' ...")
    input_object = load_contract_object(
        parameters=input_parameters,
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="input",
    )
    print(ok)  # Finished loading the input contract

    print(f"Attaching step info to input for '{step_name}.input' ... ")
    input_node_id = ms.attach_step_info(
        input_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "input",
    )
    print(f" Input Node ID: {input_node_id}")
    # Finished attaching step ID to input
    rootLogger.debug(f"Successfully saved: {input_object}")

    # TODO don't hard code any of these
    exec_dict = execution_parameters
    exec_dict["run_id"] = parameters.GITHUB_RUN_ID
    exec_dict["run_date"] = datetime.datetime.now()
    exec_dict["step_id"] = str(uuid.uuid4())

    print_left_message(f"Loading contract for '{step_name}.execution' ...")
    execution_object = load_contract_object(
        parameters=exec_dict,
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="execution",
    )
    print(ok)  # Finished loading the execution contract
    rootLogger.debug(f"Successfully loaded and validated execution: {execution_object}")

    print(f"Attaching step info to input for '{step_name}.execution' ... ")
    execution_node_id = ms.attach_step_info(
        execution_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "execution",
    )
    rootLogger.debug(f"Successfully saved: {execution_object}")
    print(f" Execution Node ID: {execution_node_id}")
    # Finished attaching step ID to execution

    # Branching between use step_execution.py or execution file.
    execution_file = os.environ.get("INPUT_EXECUTION_FILE")

    print_left_message("Executing step ... ")
    print(ok)  # Starting executing step

    results_ml_object = execute_step(
        execution_file,
        workflow_object,
        input_object,
        execution_object,
        step_name,
        parameters.GITHUB_RUN_ID,
    )

    print_left_message("Finished executing step ... ")
    print(ok)  # Finished executing step

    # TODO: Need to add next and previous steps to attach_step_info
    print(f"Attaching step info to output for '{step_name}.output' ... ")
    output_node_id = ms.attach_step_info(
        results_ml_object,
        workflow_object.schema_version,
        workflow_node_id,
        step_name,
        "output",
    )
    print(f" Output Node ID: {output_node_id}")
    # Finished attaching step ID to output

    # Result -> YAML text -> utf-8 bytes -> url-safe base64 -> str.
    dict_conversion = results_ml_object.dict_without_internal_variables()
    string_io_handle = StringIO()
    YAML.SafeDumper.add_representer(uuid.UUID, repr_uuid)
    YAML.safe_dump(dict_conversion, string_io_handle)
    yaml_conversion = string_io_handle.getvalue()
    encode_to_utf8_bytes = yaml_conversion.encode("utf-8")
    base64_encode = base64.urlsafe_b64encode(encode_to_utf8_bytes)
    final_encode_to_utf8 = str(base64_encode, "utf-8")

    # Recording raw log info
    # logBuffer.flush()
    # log_contents = logBuffer.getvalue()
    log_object = MLObject()
    log_object.set_type(schema_version="0.1.0", schema_type="log")
    log_object.run_id = parameters.GITHUB_RUN_ID
    log_object.step_name = step_name
    log_object.run_date = datetime.datetime.now()
    log_object.raw_log = (
        "NO RAW LOGS YET (NEED TO FIGURE OUT WHERE I CAN PUSH A LARGE OBJECT)"
    )
    # log_object.raw_log = log_contents
    log_object.log_property_bag = {}
    # errors = log_object.validate()

    log_node_id = ms.attach_step_info(
        log_object, workflow_object.schema_version, workflow_node_id, step_name, "log"
    )

    rootLogger.debug(
        f"::set-output name=output_raw::{results_ml_object.dict_without_internal_variables()}"
    )

    print("Printing output ... \n \n")
    logger = setupLogger()
    output_message = ""
    output_message += f"{logger.print_and_log('output_raw', results_ml_object.dict_without_internal_variables())}\n"
    output_message += (
        f"{logger.print_and_log('output_base64_encoded', final_encode_to_utf8)}\n"
    )
    output_message += f"{logger.print_and_log('input_node_id', input_node_id)}\n"
    output_message += (
        f"{logger.print_and_log('execution_node_id', execution_node_id)}\n"
    )
    output_message += f"{logger.print_and_log('output_node_id', output_node_id)}\n"
    output_message += f"{logger.print_and_log('log_node_id', log_node_id)}\n"
    rootLogger.debug(f"Complete output: \n {output_message}")
    print("\n\n... finished printing output")  # Finished printing output

    print_left_message("Generating /output_message.txt ...")
    if is_docker():
        Path("/output_message.txt").write_text(output_message)
    else:
        # Outside docker the message is only written to a scratch file; the
        # context manager makes sure the handle is closed again.
        with tempfile.TemporaryFile() as fp:
            fp.write(output_message.encode("utf-8"))
    print(ok)  # Finished writing the output message
def execute_step(
    execution_file: str,
    workflow_object: MLObject,
    input_object: MLObject,
    execution_object: MLObject,
    step_name,
    run_id,
):
    """Execute a workflow step and return its validated result object.

    When ``execution_file`` is ``None`` the step is run through
    ``StepExecution``. Otherwise the file at that path is executed with this
    function's local variables visible to it; the file is expected to leave
    its result in a variable named ``results_ml_object``.

    Args:
        execution_file: Path of the file to execute, or ``None`` to use
            ``StepExecution``.
        workflow_object: Workflow whose ``steps[step_name].output`` supplies
            the result schema type and version.
        input_object: Input contract object passed to ``StepExecution``.
        execution_object: Execution contract object passed to
            ``StepExecution``.
        step_name: Name of the step inside ``workflow_object.steps``.
        run_id: Value stored on the result as ``run_id``.

    Returns:
        The result ``MLObject`` with ``run_id``, ``step_id`` and ``run_date``
        set, after it has been validated against the output contract.

    Raises:
        KnownException: if ``execution_file`` does not exist, or if the
            result is missing, empty, or not an ``MLObject``.
    """
    rootLogger = setupLogger().get_root_logger()

    results_ml_object = MLObject()

    if execution_file is None:
        msg = "Did not find any value for INPUT_EXECUTION_FILE, using /src/step_execution.py"

        print_left_message(msg)
        rootLogger.debug("::debug::" + msg)
        print("{:>15}".format("ok"))  # Finished loading from environment

        step_execution_object = StepExecution(input_object, execution_object)
        output_contract = workflow_object.steps[step_name].output
        results_ml_object = step_execution_object.execute(
            result_object_schema_type=output_contract.schema_type,
            result_object_schema_version=output_contract.schema_version,
        )
    else:
        msg = f"Executing '{execution_file}' (found in INPUT_EXECUTION_FILE env var)"

        print_left_message(msg)
        rootLogger.debug("::debug::" + msg)

        execution_file_path = Path(execution_file)

        if not execution_file_path.exists():
            raise KnownException(
                f"'{execution_file}' was provided as the file, but it does not appear to exist at {str(execution_file_path.resolve())} -- exiting."
            )

        # The below are used in the execution file
        result_ml_object_schema_type = workflow_object.steps[  # noqa
            step_name
        ].output.schema_type
        result_ml_object_schema_version = workflow_object.steps[  # noqa
            step_name
        ].output.schema_version

        # exec() cannot rebind this function's local variables, so run the
        # file against an explicit copy of the locals and read the result
        # back out. This works whether the file mutates the provided object
        # in place or assigns a brand new one.
        exec_namespace = dict(locals())
        exec(execution_file_path.read_text(), globals(), exec_namespace)
        results_ml_object = exec_namespace.get("results_ml_object")

        print("{:>15}".format("ok"))  # Finished executing step

    if results_ml_object is None:
        raise KnownException(
            "No value was assigned to the variable 'results_ml_object' -- exiting."
        )
    # Type check comes before len() so a non-sized wrong type is reported as
    # a KnownException rather than escaping as a TypeError.
    if not isinstance(results_ml_object, MLObject):
        raise KnownException(
            "The variable 'results_ml_object' was not of type MLObject -- exiting."
        )
    if len(results_ml_object) == 0:
        raise KnownException(
            "No value was assigned to the variable 'results_ml_object' -- exiting."
        )

    results_ml_object.run_id = run_id
    results_ml_object.step_id = str(uuid.uuid4())
    results_ml_object.run_date = datetime.datetime.now().isoformat()

    # Using the below to validate the object, even though we already have it created.
    load_contract_object(
        parameters=results_ml_object.dict_without_internal_variables(),
        workflow_object=workflow_object,
        step_name=step_name,
        contract_type="output",
    )

    return results_ml_object
def main(self):
    """Drive the process_data -> train -> package sample containers.

    Builds randomized sample parameters for each step, runs the matching
    container through ``self.run_container``, and pulls the base64 encoded
    output out of ``self.buffer`` so it can be fed to the next step as its
    input. The decoded output of the final (package) step is printed.

    Raises:
        RuntimeError: if a container run does not emit an
            ``output_base64_encoded`` line in the captured buffer.
    """
    # Ideas for what to build on top of these runs:
    # - Dashboard for runs
    #   -- Size
    #   -- Likelihood of bias
    #   -- Time for run
    #   -- Accuracy
    # - Filter by version
    # - Look up at top version and show metadata going in and out
    # - Show bad input (e.g. it's null) and what happens when you run it
    # - Show when you add a new step - how you can compare those with other versions

    credentials = Credentials.metastore_credentials_prod

    MLSchema.append_schema_to_registry(Path(".parameters") / "schemas")

    repo_name = "mlspec"
    output_regex = "::set-output name=output_base64_encoded::(.*?)\\\\"

    run_date_start = datetime.datetime(2020, 1, 1) + datetime.timedelta(
        seconds=random.randrange(0, 5184000)
    )
    run_id = str(uuid.uuid4())

    def _build_environment(step, input_key, input_value, execution_run):
        # Environment handed to a container; only the step name, the input
        # parameter entry and the execution parameters differ between steps.
        return YAML.safe_load(
            "\n".join(
                [
                    "INPUT_schemas_directory: '.parameters/schemas'",
                    "INPUT_schemas_git_url: 'https://github.com/mlspec/mlspeclib-action-samples-schemas.git'",
                    "INPUT_workflow_node_id: 'workflow|500.0.1|31ca83ed-8263-4c8c-8672-7a2163a34725'",
                    f"INPUT_step_name: {step}",
                    f"{input_key}: {input_value}",
                    f"INPUT_execution_parameters_raw: {execution_run.dict_without_internal_variables()}",
                    f"INPUT_METASTORE_CREDENTIALS: {credentials}",
                    f"GITHUB_RUN_ID: {run_id}",
                    "GITHUB_WORKSPACE: '/src'",
                ]
            )
        )

    def _run_and_capture(image_name, environment):
        # Run one container and return the base64 payload it reported.
        self.run_container(repo_name, image_name, environment)
        match = re.search(output_regex, self.buffer.getvalue())
        if match is None:
            raise RuntimeError(
                f"No 'output_base64_encoded' value found in the output of '{image_name}'."
            )
        return match.group(1)

    def _reset_buffer():
        # Clear captured output so the next search only sees the next run.
        self.buffer.truncate(0)
        self.buffer.seek(0)

    # ---- process_data -------------------------------------------------
    step_name = "process_data"

    data_source = MLObject()
    data_source.set_type("500.0.1", "data_source")
    data_source.run_id = run_id
    data_source.step_id = str(uuid.uuid4())
    data_source.run_date = str(run_date_start.isoformat())
    data_source.source_id = str(uuid.uuid4())
    data_source.source_uri = f"https://internal.contoso.com/datasets/raw_nlp_data-{run_date_start.strftime('%Y-%m-%d')}-{get_random_md5()}"  # noqa
    data_source.extended_properties = {}

    data_process_run = MLObject()
    data_process_run.set_type("500.0.1", "data_process_run")
    data_process_run.nodes = random.randrange(1, 4) * 2
    data_process_run.cpu_per_node = f"{random.randrange(2,8) * 2}"
    data_process_run.ram_per_node = f"{random.randrange(1,16) * 8}Gi"
    # randrange(1, 2) can only ever return 1, which made this flag constant;
    # pick a genuinely random boolean instead.
    data_process_run.gpu_required = random.choice((True, False))
    data_process_run.output_root_path = (
        "https://internal.contoso.com/datasets/processed_data/"
    )
    data_process_run.base_image = random_base_image()
    data_process_run.machine_type = random_machine_type()
    data_process_run.run_id = run_id
    data_process_run.step_id = str(uuid.uuid4())
    data_process_run.run_date = str(run_date_start.isoformat())
    data_process_run.extended_properties = {}

    environment_dict = _build_environment(
        step_name,
        "INPUT_input_parameters_raw",
        data_source.dict_without_internal_variables(),
        data_process_run,
    )
    process_data_encoded_val = _run_and_capture(
        "mlspeclib-action-samples-process-data", environment_dict
    )
    # Below is for debugging, we're ok leaving it in base64 encoded
    # process_data_output_value = base64.urlsafe_b64decode(process_data_encoded_val)
    _reset_buffer()

    # ---- train --------------------------------------------------------
    step_name = "train"

    training_run = MLObject()
    training_run.set_type("500.0.1", "training_run")
    training_run.nodes = random.randrange(1, 4) * 2
    training_run.cpu_per_node = random.randrange(2, 8) * 2
    training_run.ram_per_node = f"{random.randrange(1,16) * 8}Gi"
    training_run.gpu_required = random.choice((True, False))
    training_run.output_path = "test/models/output"
    training_run.training_params.learning_rate = 1 / (pow(10, random.randint(0, 4)))
    training_run.training_params.loss = random.random()
    training_run.training_params.batch_size = random.randrange(1, 5) * 500
    training_run.training_params.epoch = random.randrange(1, 8) * 25
    training_run.training_params.optimizer = ["SGD"]
    training_run.training_params.other_tags = {
        "pii": False,
        "data_sha": "8b03f70",
    }
    training_run.extended_properties = {}

    environment_dict_train = _build_environment(
        step_name,
        "INPUT_input_parameters_base64",
        process_data_encoded_val,
        training_run,
    )
    train_encoded_val = _run_and_capture(
        "mlspeclib-action-samples-train", environment_dict_train
    )
    # train_output_value = base64.urlsafe_b64decode(train_encoded_val)
    _reset_buffer()

    # ---- package ------------------------------------------------------
    step_name = "package"

    package_run = MLObject()
    package_run.set_type("500.0.1", "package_run")
    package_run.run_id = run_id
    package_run.step_id = str(uuid.uuid4())
    package_run.run_date = run_date_start.isoformat()
    package_run.model_source = "/nfs/trained_models/nlp"
    package_run.container_registry = f"https://registry.hub.docker.com/v1/repositories/contoso/nlp/{get_random_md5()}"  # noqa
    package_run.agent_pool = "nlp-build-pool"
    package_run.build_args = ["arg1", "arg2", "arg3"]
    package_run.extended_properties = {}
    package_run.secrets = {
        "credentials": "AZURE_CREDENTIALS",
        "docker_username": "******",
        "docker_password": "******",
    }

    environment_dict_package = _build_environment(
        step_name,
        "INPUT_input_parameters_base64",
        train_encoded_val,
        package_run,
    )
    encoded_val = _run_and_capture(
        "mlspeclib-action-samples-package", environment_dict_package
    )
    print(base64.urlsafe_b64decode(encoded_val))
    self.buffer.flush()
def test_e2e(self):
    """Run the process_data step and check its output against the contract.

    An ``MLObject`` with only its schema type/version set is expected to fail
    ``verify_result_contract`` with a ``ValueError``. The step is then
    executed through ``StepExecution`` and, once run metadata and the
    execution profile are filled in, its result is expected to verify.
    """
    MLSchema.populate_registry()
    MLSchema.append_schema_to_registry(Path.cwd() / ".parameters" / "schemas")

    # Execute step
    input_parameters = {
        # Put sample required input parameters here
    }

    execution_parameters = {
        # Put sample required execution parameters here
    }

    # THESE SHOULD BE THE ONLY SETTINGS FOR THIS FILE
    step_name = "process_data"
    expected_results_schema_type = "data_result"  # MUST BE A LOADED SCHEMA
    expected_results_schema_version = "500.0.1"  # MUST BE A SEMVER

    step_execution_object = StepExecution(input_parameters, execution_parameters)

    results_object = MLObject()
    results_object.set_type(
        schema_type=expected_results_schema_type,
        schema_version=expected_results_schema_version,
    )

    # Should error due to missing fields
    with self.assertRaises(ValueError) as context:
        verify_result_contract(
            results_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )

    self.assertTrue(
        f"Error verifying result object for '{step_name}.output'"
        in str(context.exception)
    )

    results_object = step_execution_object.execute(
        result_object_schema_type=expected_results_schema_type,
        result_object_schema_version=expected_results_schema_version,
    )

    results_object.run_date = datetime.datetime.now()
    results_object.step_id = str(uuid.uuid4())
    results_object.run_id = str(uuid.uuid4())

    profile = results_object.execution_profile
    profile.system_memory_utilization = random()
    # randint() needs real integers: float bounds raise TypeError on 3.12+.
    profile.network_traffic_in_bytes = randint(int(7e9), int(9e10))
    profile.gpu_temperature = randint(70, 130)
    profile.disk_io_utilization = random()
    profile.gpu_percent_of_time_accessing_memory = random()
    profile.cpu_utilization = random()
    profile.gpu_utilization = random()
    profile.gpu_memory_allocation = random()

    self.assertTrue(
        verify_result_contract(
            results_object,
            expected_results_schema_type,
            expected_results_schema_version,
            step_name,
        )
    )