def read_desc_file(self, comp_path):
    self._logger.debug("Reading component's metadata: {}".format(comp_path))
    comp_ref_json = os.path.join(comp_path, ComponentsDesc.COMPONENT_METADATA_REF_FILE)
    if os.path.isfile(comp_ref_json):
        with open(comp_ref_json, "r") as f:
            try:
                comp_ref = json.load(f)
            except ValueError:
                msg = "Failed to load (parse) component metadata's reference file! ref-file: {}" \
                    .format(comp_ref_json)
                self._logger.error(msg)
                raise MLCompException(msg)

        metadata_filename = comp_ref[json_fields.COMPONENT_METADATA_REF_FILE_NAME_FIELD]
        comp_desc = ComponentsDesc._load_comp_desc(comp_path, metadata_filename)
    else:
        # Try to find any valid component's description file
        comp_desc_gen = ComponentsDesc.next_comp_desc(comp_path)
        try:
            # next() is called only once, because only one component JSON file is expected.
            _, comp_desc, _ = next(comp_desc_gen)
        except StopIteration:
            comp_desc = None

    if not comp_desc:
        msg = "Failed to find any valid component's json desc! comp_path: {}".format(comp_path)
        self._logger.error(msg)
        raise MLCompException(msg)

    return comp_desc
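# A hedged illustration of the reference-file layout read_desc_file expects.
# The actual file name and JSON key are taken from
# ComponentsDesc.COMPONENT_METADATA_REF_FILE and
# json_fields.COMPONENT_METADATA_REF_FILE_NAME_FIELD, whose literal values do
# not appear in this snippet, so the placeholders below are hypothetical:
#
#   <comp_path>/<COMPONENT_METADATA_REF_FILE>:
#       { "<COMPONENT_METADATA_REF_FILE_NAME_FIELD>": "my_component.json" }
#
# "my_component.json" is then loaded from the same directory via
# ComponentsDesc._load_comp_desc(comp_path, "my_component.json").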
def set_dataframe(self, dataframe):
    if self._dataframe:
        if type(self._dataframe) is list:
            raise MLCompException("DataFrame list was already set for the given pipeline! pipeline: {}"
                                  .format(self.name()))
        else:
            raise MLCompException("DataFrame was already set for the given pipeline! pipeline: {}, "
                                  "existing-columns: {}, new-columns: {}"
                                  .format(self.name(), self._dataframe.columns, dataframe.columns))
    self._dataframe = dataframe
def _validate_output(self, rdds):
    if rdds:
        if type(rdds) is not list:
            raise MLCompException("Invalid non-list output! Expecting a list of RDDs!")

        for rdd in rdds:
            if not issubclass(rdd.__class__, RDD):
                raise MLCompException("Invalid element type in the returned list! Expecting 'pyspark.rdd.RDD'! "
                                      "name: {}, type: {}".format(self.name(), type(rdd)))
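# A minimal usage sketch, assuming a live pyspark environment and a
# hypothetical `component` instance exposing the method above: a valid output
# is a list whose elements are all pyspark RDDs.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rdd_a = sc.parallelize([1, 2, 3])
rdd_b = sc.parallelize(["a", "b"])

component._validate_output([rdd_a, rdd_b])  # passes: a list of RDDs
component._validate_output([rdd_a, 42])     # raises MLCompException: 42 is not an RDD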
def _find_comp_desc(self, pipe_comp):
    comp_type = pipe_comp[json_fields.PIPELINE_COMP_TYPE_FIELD]
    self._logger.debug("Component found in pipeline, id={}, type={}".format(
        pipe_comp[json_fields.PIPELINE_COMP_ID_FIELD], comp_type))

    match_comp = [comp for comp in self._comp_desc_list
                  if comp[json_fields.COMPONENT_DESC_NAME_FIELD] == comp_type]
    if not match_comp:
        raise MLCompException(
            "Could not find a component read from the pipeline! type=[{}] desc=[{}] pipe_comp=[{}]".format(
                comp_type, self._comp_desc_list, pipe_comp[json_fields.PIPELINE_COMP_ID_FIELD]))
    elif len(match_comp) > 1:
        raise MLCompException("Found more than one matching component! type=" + comp_type)

    return match_comp[0]
def _print_acc_messages(self):
    if not self.__logger:
        raise MLCompException("Logger was not set! Invalid internal sequence!")

    if self._msg_container:
        for m in self._msg_container:
            self.__logger.info(m)
def set_output_model_path(self, path):
    if self._output_model_path:
        raise MLCompException("Output model path was already set for the given pipeline! pipeline: {}, "
                              "existing-path: {}, new-path: {}"
                              .format(self.name(), self._output_model_path, path))
    self._output_model_path = path
def monitor(self):
    self._logger.info("Monitoring job ... {}".format(self._job_name))
    while True:
        response = self._describe_job()
        if self._logger.isEnabledFor(logging.DEBUG):
            self._logger.debug(pprint.pformat(response, indent=4))

        status = self._job_status(response)
        running_time_sec = self._total_running_time_sec(response)
        billing_time_sec = self._billing_time_sec(response)
        Report.job_status(self._job_name, running_time_sec, billing_time_sec, status)

        self._report_online_metrics(response)

        if status == SMApiConstants.JOB_COMPLETED:
            self._report_final_metrics(response)
            self._logger.info("Job '{}' completed!".format(self._job_name))
            if self._on_complete_callback:
                self._on_complete_callback(response)
            break
        elif status == SMApiConstants.JOB_FAILED:
            msg = "Job '{}' failed! message: {}".format(self._job_name,
                                                        response[SMApiConstants.FAILURE_REASON])
            self._logger.error(msg)
            raise MLCompException(msg)
        elif status != SMApiConstants.JOB_IN_PROGRESS:
            self._logger.warning("Unexpected job status! job-name: {}, status: {}"
                                 .format(self._job_name, status))

        self._logger.info("Job '{}' is still running ... {} sec".format(self._job_name, running_time_sec))
        time.sleep(JobMonitorBase.MONITOR_INTERVAL_SEC)
def load(self, extended=True):
    components_desc = []

    if not self._comp_root_path:
        try:
            # The following call to 'pkg_resources.resource_filename' extracts all the files
            # under the 'parallelm.code_components' package from the component's egg file
            self._comp_root_path = pkg_resources.resource_filename(ComponentsDesc.CODE_COMPONETS_MODULE_NAME, '')
            self._logger.info("Cached components are at: {}".format(self._comp_root_path))
        except ModuleNotFoundError:
            msg = "Either the component's root path or the component's egg file is missing!"
            self._logger.error(msg)
            raise MLCompException(msg)

    for comp_type in self._get_next_comp_type_in_pipeline():
        self._logger.info("Handling {}".format(comp_type))
        comp_path = os.path.join(self._comp_root_path, comp_type)
        if comp_path not in sys.path:
            sys.path.insert(0, comp_path)

        comp_desc = self.read_desc_file(comp_path)
        if extended:
            comp_desc[json_fields.COMPONENT_DESC_ROOT_PATH_FIELD] = comp_path
            self._add_default_values(comp_desc)
        self._logger.debug("Component loaded: " + str(comp_desc))
        components_desc.append(comp_desc)

    return components_desc
def run_connected_pipeline(self, system_conf, ee_conf, engine_info):
    print("Running pipeline")
    self._logger.debug("Running connected pipeline")

    # Components configuration phase
    for dag_node in self._sorted_execution_graph_list:
        input_args = dag_node.input_arguments(system_conf, ee_conf)
        dag_node.component_runner.configure(input_args)

    # Components materialize phase
    for dag_node in self._sorted_execution_graph_list:
        parent_data_objs = self.parent_data_objs(dag_node)
        self._logger.debug("Calling dag node '{}', with args: {}".format(dag_node.comp_name(), parent_data_objs))

        self._component_run_header(dag_node)
        start = time.time()
        sys.stderr.flush()
        sys.stdout.flush()
        data_objs = dag_node.component_runner.run(parent_data_objs)
        sys.stderr.flush()
        sys.stdout.flush()
        runtime_in_sec = time.time() - start

        if data_objs and type(data_objs) is not list:
            raise MLCompException("Invalid returned data type from component! It should be a list! "
                                  "name: " + dag_node.comp_name())

        self._component_run_footer(dag_node, data_objs, runtime_in_sec)
        self._logger.debug("Output of dag node '{}' is: {}".format(dag_node.comp_name(), data_objs))
        self.update_parent_data_objs(dag_node, data_objs)

    self._ml_engine.finalize()
def _setup_py4j_client_connection(self):
    gateway_params = GatewayParameters(port=self._java_port, auto_field=True,
                                       auto_close=True, eager_load=True)
    callback_server_params = CallbackServerParameters(port=0, daemonize=True,
                                                      daemonize_connections=True, eager_load=True)
    self._gateway = JavaGateway(gateway_parameters=gateway_params,
                                callback_server_parameters=callback_server_params,
                                python_server_entry_point=self)
    self._component_via_py4j = self._gateway.entry_point.getComponent()
    if not self._component_via_py4j:
        raise MLCompException("Got a 'None' reference to the py4j Java component object!")

    if self._verbose:
        self._logger.debug(self._prefix_msg + "Py4J component referenced successfully! comp_via_py4j: {}"
                           .format(self._component_via_py4j))

    self._component_via_py4j.setEnvAttributes(self.get_wid(), self._verbose)
def _monitor_uwsgi_proc(self, stop_msg=None):
    try:
        monitor_stats = not stop_msg
        block_size = 2048
        stderr_fd = 2
        stdout_buff2lines = BufferToLines()
        stderr_buff2lines = BufferToLines()
        keep_reading = True
        last_stats_read = time.time()

        while keep_reading:
            read_fs = [self._stdout_pipe_r, self._stderr_pipe_r]

            # Sleep only the time left within the stats-reporting interval
            if monitor_stats:
                sleep_time = self._stats_reporting_interval_sec - (time.time() - last_stats_read)
                if sleep_time < 0:
                    sleep_time = 0
            else:
                sleep_time = self._stats_reporting_interval_sec

            readable_fd = select.select(read_fs, [], [], sleep_time)[0]

            if monitor_stats:
                wakeup_time = time.time()
                if wakeup_time - last_stats_read > self._stats_reporting_interval_sec:
                    last_stats_read = wakeup_time
                    if self._stats:
                        self._stats.report()

            if readable_fd:
                for pipe in readable_fd:
                    if pipe is self._stdout_pipe_r:
                        buff = os.read(pipe, block_size)
                        stdout_buff2lines.add(buff)
                        for line in stdout_buff2lines.lines():
                            print(line)

                        if stop_msg and stop_msg.encode() in buff:
                            keep_reading = False

                    if pipe is self._stderr_pipe_r:
                        buff = os.read(pipe, block_size)
                        stderr_buff2lines.add(buff)
                        for line in stderr_buff2lines.lines():
                            os.write(stderr_fd, (line + '\n').encode())
            else:
                rc = self._proc.poll()
                if rc is not None:
                    if rc != 0:
                        raise MLCompException("Error in 'uwsgi' server! rc: {}".format(rc))
                    break
    except MLCompException as e:
        self._cleanup()
        raise e
def add_related_metric(self, bar_graph_metric):
    if self.metric_relation != MetricRelation.BAR_GRAPH:
        raise MLCompException("A related metric can be added only to a bar graph metric!")

    if not isinstance(bar_graph_metric, tuple) or len(bar_graph_metric) != 2:
        raise MLCompException("Related metric information should be a tuple of the metric itself and a "
                              "bar column label! related_metric: {}".format(bar_graph_metric))

    if not isinstance(bar_graph_metric[0], Metric):
        raise MLCompException("First element in a related bar graph metric should be a Metric! "
                              "provided: {}".format(bar_graph_metric[0]))

    if not isinstance(bar_graph_metric[1], six.string_types):
        raise MLCompException("Second element in a related bar graph metric should be a string! "
                              "provided: {}".format(bar_graph_metric[1]))

    self._related_metric.append(bar_graph_metric)
def _call_class_attr(cls, attr_name):
    attr = getattr(cls, attr_name, None)
    if not attr:
        raise MLCompException("The given class does not include the given attribute name! "
                              "class: {}, attr_name: {}".format(cls, attr_name))

    return attr() if callable(attr) else attr
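# A minimal usage sketch of _call_class_attr, assuming it is reachable as a
# plain function or staticmethod: a plain attribute is returned as-is, while
# a callable attribute is invoked first. `Example` is hypothetical.
class Example(object):
    NAME = "example"

    @classmethod
    def version(cls):
        return "1.0"

assert _call_class_attr(Example, "NAME") == "example"  # plain attribute
assert _call_class_attr(Example, "version") == "1.0"   # callable, invoked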
def start(self):
    if not self._proc:
        raise MLCompException("uWSGI process was not set up for monitoring!")

    th = threading.Thread(target=self._run)
    self._monitor_info[UwsgiConstants.MONITOR_THREAD_KEY] = th
    th.start()
def __init__(self, handler, raw):
    if not inspect.ismethod(handler):
        raise MLCompException("Invalid REST endpoint handler! Should be a component's method with the "
                              "following prototype: <handler>(self, url_params, form_params), given: {}"
                              .format(handler))

    self._handler = handler
    self._raw = raw
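# A hedged sketch of a handler that satisfies the check above (Python 3
# semantics): only a *bound* method passes inspect.ismethod(). `MyComponent`
# and its `predict` endpoint are hypothetical; only the required
# (self, url_params, form_params) prototype comes from the code above.
import inspect

class MyComponent(object):
    def predict(self, url_params, form_params):
        return {"status": "ok"}

comp = MyComponent()
assert inspect.ismethod(comp.predict)             # bound method => accepted
assert not inspect.ismethod(MyComponent.predict)  # plain function => rejected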
def _materialize(self, parent_data_objs, user_data):
    self._init_params()

    tmp_dataset_filepath = os_util.tmp_filepath()
    self._logger.info("Temporary dataset file path: {}".format(tmp_dataset_filepath))
    try:
        urllib.request.urlretrieve(self._dataset_url, tmp_dataset_filepath,
                                   reporthook=DatasetDownloader._download_report_hook)
        self._logger.info("Dataset download completed ... 100%")

        train_set, valid_set, test_set = (None, None, None)
        with gzip.open(tmp_dataset_filepath, 'rb') as f:
            loaded_artifacts = pickle.load(f, encoding='latin1')
            try:
                train_set, valid_set, test_set = loaded_artifacts
            except ValueError:
                try:
                    train_set, valid_set = loaded_artifacts
                except ValueError:
                    train_set = loaded_artifacts

        self._logger.info("Dataset downloaded and loaded! "
                          "#samples in train set: {}, "
                          "#samples in valid set: {}, "
                          "#samples in test set: {}"
                          .format(len(train_set[0]) if train_set else None,
                                  len(valid_set[0]) if valid_set else None,
                                  len(test_set[0]) if test_set else None))

        if self._train_set_local_csv_filepath and train_set:
            self._save_to_csv(train_set[0], self._train_set_local_csv_filepath)

        if self._valid_set_local_csv_filepath and valid_set:
            self._save_to_csv(valid_set[0], self._valid_set_local_csv_filepath)

        if self._test_set_local_csv_filepath and test_set:
            self._save_to_csv(test_set[0], self._test_set_local_csv_filepath)

        return [train_set, valid_set, test_set]
    except Exception as e:
        msg = "Failed to download and read dataset!\n{}".format(e)
        self._logger.error(msg)
        raise MLCompException(msg)
    finally:
        self._logger.info("Cleaning up temporary dataset file path: {}".format(tmp_dataset_filepath))
        os_util.remove_file_safely(tmp_dataset_filepath)
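# A hedged sketch of producing a dataset file that _materialize above can
# consume, assuming the MNIST-style layout implied by its unpacking logic:
# a gzip'ed pickle of (train_set, valid_set, test_set), each a
# (samples, labels) pair, where len(set[0]) is the sample count.
import gzip
import pickle

train_set = ([[0.1, 0.2], [0.3, 0.4]], [0, 1])  # (samples, labels)
valid_set = ([[0.5, 0.6]], [0])
test_set = ([[0.7, 0.8]], [1])

with gzip.open("dataset.pkl.gz", "wb") as f:
    pickle.dump((train_set, valid_set, test_set), f)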
def _read_execution_env_params(self):
    ee_config = self._pipeline.get('executionEnvironment', dict()).get('configs')
    if not ee_config:
        raise MLCompException("Missing execution environment section in pipeline json!")

    eng_config = ee_config.get('engConfig')
    if not eng_config:
        raise MLCompException("Missing execution environment engine section in pipeline json!")

    if eng_config['type'] != SageMakerEngine.TYPE:
        raise MLCompException("Unexpected engine type in execution environment! expected: '{}', got: {}"
                              .format(SageMakerEngine.TYPE, eng_config['type']))

    return eng_config['arguments']
def _run(self, shared_conf):
    self._logger.info("Starting 'nginx' service ... cmd: '{}'".format(NginxConstants.START_CMD))
    if self._dry_run:
        return

    # 'subprocess.call' is used (rather than 'check_call', which raises on its
    # own) so that a non-zero exit code can be turned into an MLCompException.
    rc = subprocess.call(NginxConstants.START_CMD, shell=True)
    if rc != 0:
        raise MLCompException("The 'nginx' service failed to start! It may not be installed!")

    self._logger.info("'nginx' service started successfully!")
def _server_conf_filepath(self, platform_name):
    if self._debian_platform(platform_name):
        d = NginxConstants.SERVER_CONF_DIR_DEBIAN
    elif self._redhat_platform(platform_name):
        d = NginxConstants.SERVER_CONF_DIR_REDHAT
    elif self._macos_platform(platform_name):
        if not os.path.isdir(NginxConstants.SERVER_CONF_DIR_MACOS):
            if not os.path.isdir(NginxConstants.NGINX_ROOT_MACOS):
                raise MLCompException("'{}' does not exist or is not a directory. Is nginx installed?"
                                      .format(NginxConstants.NGINX_ROOT_MACOS))
            os.mkdir(NginxConstants.SERVER_CONF_DIR_MACOS)
        d = NginxConstants.SERVER_CONF_DIR_MACOS
    else:
        raise MLCompException("Nginx cannot be configured! Platform is not supported: {}".format(platform_name))

    return os.path.join(d, NginxConstants.SERVER_CONF_FILENAME)
def _materialize(self, parent_data_objs, user_data):
    if not parent_data_objs or len(parent_data_objs) != 3:
        raise MLCompException("Expecting 3 parent inputs! got: {}, parent_data: {}"
                              .format(len(parent_data_objs) if parent_data_objs else 0, parent_data_objs))

    self._init_params(parent_data_objs)
    self._convert_and_upload()
    self._do_training()
    self._monitor_job()
    self._download_model()
def _materialize(self, parent_data_objs, user_data):
    if len(parent_data_objs) != 1:
        raise MLCompException("Expecting exactly one input: the S3 URL of a file to download!")

    s3_url = parent_data_objs[0]
    if s3_url:
        local_filepath = self._params['local_filepath']
        AwsHelper(self._logger).download_file(s3_url, local_filepath)
    else:
        self._logger.info("Nothing to download from AWS S3!")
def __init__(self, mlops, ml_engine, polling_interval_sec=10.0):
    super(BgActor, self).__init__()
    self.set_logger(ml_engine.get_engine_logger(self.logger_name()))

    if not mlops or not mlops.init_called:
        raise MLCompException("'mlops' was not set up properly!")

    self._mlops = mlops
    self._polling_interval_sec = polling_interval_sec
    self._condition = threading.Condition()
    self._stop_gracefully = False
def _load_comp_desc(root, filename):
    if filename.endswith(".json"):
        comp_json = os.path.join(root, filename)
        with open(comp_json) as f:
            try:
                comp_desc = json.load(f)
            except ValueError as ex:
                raise MLCompException("Found json file with invalid json format! "
                                      "filename: {}, exception: {}".format(comp_json, str(ex)))

        if ComponentsDesc.is_valid(comp_desc):
            return comp_desc

    return None
def _setup_env(self, eng_args_config):
    region = EeArg(eng_args_config.get('region')).value

    aws_access_key_id = EeArg(eng_args_config.get('aws_access_key_id')).value
    if not aws_access_key_id:
        raise MLCompException("Missing 'aws_access_key_id' parameter in execution environment!")

    aws_secret_access_key = EeArg(eng_args_config.get('aws_secret_access_key')).value
    if not aws_secret_access_key:
        raise MLCompException("Missing 'aws_secret_access_key' parameter in execution environment!")

    os.environ[SageMakerEngine.AWS_DEFAULT_REGION] = region
    os.environ[SageMakerEngine.AWS_ACCESS_KEY_ID] = aws_access_key_id
    os.environ[SageMakerEngine.AWS_SECRET_ACCESS_KEY] = aws_secret_access_key
def _init_ml_engine(self, pipeline):
    engine_type = pipeline[json_fields.PIPELINE_ENGINE_TYPE_FIELD]
    self._logger.info("Engine type: {}".format(engine_type))

    if engine_type == EngineType.PY_SPARK:
        from parallelm.ml_engine.py_spark_engine import PySparkEngine

        self._ml_engine = PySparkEngine(pipeline[json_fields.PIPELINE_NAME_FIELD],
                                        self._run_locally, self._spark_jars)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))

        if mlops_loaded:
            mlops.init(self._ml_engine.context)

    elif engine_type == EngineType.GENERIC:
        from parallelm.ml_engine.python_engine import PythonEngine

        self._logger.info("Using python engine")
        self._ml_engine = PythonEngine(pipeline[json_fields.PIPELINE_NAME_FIELD], self._mlcomp_jar)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))

        if mlops_loaded:
            # This initialization applies only to Python components and not to components
            # that are written in other languages (e.g. R). The reason is that those
            # components are executed in a different process and thus need to load and
            # init the mlops library separately.
            mlops.init()

    elif engine_type == EngineType.REST_MODEL_SERVING:
        from parallelm.ml_engine.rest_model_serving_engine import RestModelServingEngine

        self._logger.info("Using REST Model Serving engine")
        self._ml_engine = RestModelServingEngine(pipeline[json_fields.PIPELINE_NAME_FIELD],
                                                 self._mlcomp_jar, self._standalone)
        self.set_logger(self._ml_engine.get_engine_logger(self.logger_name()))

        if mlops_loaded:
            # Same note as above: components written in other languages (e.g. R) run in
            # a separate process and must load and init the mlops library on their own.
            mlops.init()

    else:
        raise MLCompException("Engine type is not supported by the Python execution engine! engineType: "
                              + engine_type)

    if mlops_loaded:
        self._ml_engine.run(mlops, pipeline)
def __init__(self, name, title=None, hidden=False, metric_type=MetricType.COUNTER,
             value_type=int, metric_relation=None, related_metric=None):
    super(Metric, self).__init__(logging.getLogger(self.logger_name()))
    self._metric_name = name + Metric.NAME_SUFFIX
    self._title = title
    self._hidden = hidden
    self._metric_type = metric_type
    self._value_type = value_type
    self._metric_relation = metric_relation
    self._metric_already_displayed = False

    if not self._hidden and not self._title:
        raise MLCompException("A metric can be seen in the UI only if 'title' is provided! name: {}"
                              .format(name))

    if self.metric_relation == MetricRelation.BAR_GRAPH:
        if not isinstance(related_metric, list):
            raise MLCompException("A bar graph metric should be provided with a list of metric tuples. "
                                  "Each tuple should contain the related metric and its bar name! "
                                  "name: {}, related_metrics: {}".format(name, related_metric))
        self._related_metric = []
        for m in related_metric:
            self.add_related_metric(m)
    else:
        self._related_metric = related_metric if isinstance(related_metric, list) else [related_metric]
        if self._related_metric[0] and self._related_metric[0].metric_type != metric_type:
            raise MLCompException("Error in metrics relation! The given metric cannot relate to another "
                                  "metric of a different type! metric: {}, type: {}, "
                                  "related-metric: {}, type: {}"
                                  .format(name, metric_type, self._related_metric[0].metric_name,
                                          self._related_metric[0].metric_type))

    if self._metric_name in Metric._metrics:
        raise MLCompException("Metric has already been defined! name: {}".format(name))

    self._logger.info("Add new uwsgi metric ... {}".format(self._metric_name))
    Metric._metrics[self._metric_name] = self
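# A hedged usage sketch of defining a bar-graph metric with related metrics.
# The metric names and bar labels are hypothetical; the constructor arguments
# and the (metric, bar-label) tuple shape come from the code above.
requests = Metric("requests.total", title="Total requests",
                  metric_type=MetricType.COUNTER)
errors = Metric("requests.errors", title="Failed requests",
                metric_type=MetricType.COUNTER)

breakdown = Metric("requests.breakdown", title="Requests breakdown",
                   metric_type=MetricType.COUNTER,
                   metric_relation=MetricRelation.BAR_GRAPH,
                   related_metric=[(requests, "total"), (errors, "errors")])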
def _materialize(self, parent_data_objs, user_data):
    if not parent_data_objs:
        raise MLCompException("Missing expected dataset S3 url from parent input!")

    if not self._init_params(parent_data_objs):
        return

    self._upload_model_to_s3()
    self._create_model()
    self._create_transformation_job()
    self._monitor_job()
    return [self._predictions_s3_url()]
def _visit(self, t_node):
    self._logger.debug("Visiting node: {}".format(t_node.key))
    if t_node.perm_visit:
        return

    if t_node.temp_visit:
        raise MLCompException("The pipeline has an invalid cyclic loop (not a DAG)! pipe-node-id: {}"
                              .format(t_node.key))

    t_node.temp_visit = True
    for child_key in t_node.child_keys:
        if child_key not in self._graph_aux:
            raise MLCompException("Child id was not found in the graph! key: {}".format(child_key))
        self._visit(self._graph_aux[child_key])

    t_node.temp_visit = False
    t_node.perm_visit = True
    self._sorted_graph.append(t_node.node)
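# A self-contained sketch of the same algorithm: depth-first topological sort
# with temporary/permanent marks to detect cycles. The plain-dict graph below
# is only illustrative.
def topo_sort(graph):
    """graph: {node: [child, ...]}; returns a children-first ordering."""
    temp, perm, order = set(), set(), []

    def visit(node):
        if node in perm:
            return
        if node in temp:
            raise ValueError("Cycle detected at node: {}".format(node))
        temp.add(node)
        for child in graph.get(node, []):
            visit(child)
        temp.discard(node)
        perm.add(node)
        order.append(node)  # appended after all children, like _sorted_graph

    for node in graph:
        visit(node)
    return order

print(topo_sort({"a": ["b", "c"], "b": ["c"], "c": []}))  # ['c', 'b', 'a']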
def _load_pipeline(self):
    if self._pipeline:
        return self._pipeline

    if self._json_pipeline:
        self._pipeline = json.loads(self._json_pipeline)
    elif self._pipeline_file:
        self._pipeline = json.load(self._pipeline_file)
    else:
        raise MLCompException("Missing pipeline file!")

    # Validations
    if json_fields.PIPELINE_PIPE_FIELD not in self._pipeline:
        raise MLCompException("Pipeline does not contain any component! pipeline=" + str(self._pipeline))

    if mlops_loaded:
        pipeline_str = mask_passwords(str(self._pipeline))
    else:
        pipeline_str = str(self._pipeline)
    self._logger.debug("Pipeline: " + pipeline_str)

    return self._pipeline