def _get_progress_from_file(self):
    """Return the contents of the job's progress.log (minus the trailing
    sentinel character), or "" when the log is missing or empty."""
    log_path = '/'.join([self._working_dir, 'progress.log'])
    if not _file_util.exists(log_path):
        __LOGGER__.warn('Progress log file cannot be found')
        return ""
    data = _file_util.read(log_path)
    # Last character is a terminator (presumably the '$' sentinel used by
    # the other log helpers in this module -- TODO confirm); drop it.
    return data[:-1] if data else ""
def get_results(self):
    """
    Retrieve the result from this job. This is a BLOCKING function.

    It will block until the job is completed and a usable result can be
    returned (usually a trained model).

    Raises a RuntimeError if the job fails for any reason, or if the job
    succeeds but the result cannot be returned.

    Returns
    -------
    out : many types
    """
    # Return the cached result if a previous call already fetched it.
    if self._result is not None:
        return self._result

    # Block until the client process exits.
    (retcode, output) = self._job_handle.wait()

    # The final state may lag behind process exit; poll until available.
    final_state_ready = False
    while not final_state_ready:
        try:
            state = self.get_final_state()
            final_state_ready = True
        except RuntimeError:
            # Final state not published yet -- pause and retry.
            # (Fix: the sleep used to run unconditionally after the try,
            # adding a pointless 1-second delay even once the state had
            # been read successfully.)
            time.sleep(1)

    if state == JobHandle.FINAL_STATE.SUCCESS:
        # Results are serialized to '<working_dir>/out'.
        result_location = '/'.join([self._working_dir, 'out'])
        ret_str = _file_util.read(result_location)
        if ret_str is None:
            raise RuntimeError("Failed to read from expected result location: "
                               + str(result_location))
        self._dml_serdes.from_str(ret_str)
        result_dict = self._dml_serdes.to_dict()
        # A toolkit-level failure is reported through an 'exception' key even
        # though the job itself finished "successfully".
        if 'exception' in result_dict:
            __LOGGER__.debug("Found exception in DML result. Result: " + str(result_dict))
            raise ToolkitError(result_dict['exception'])
        if 'result' not in result_dict:
            __LOGGER__.debug("Model not found in DML result. Result: " + str(result_dict))
            raise RuntimeError("Model not found, though job completed successfully!")
        dml_result = result_dict['result']
        # Wrap the raw result in the algorithm-specific result object.
        result_ctor = _supported_result_objects[self._algo_name]
        self._result = result_ctor(dml_result)
        return self._result
    else:
        # Keep client output around for post-mortem via status_summary.
        self._failed_client_output = output
        __LOGGER__.debug("Client failed. Return code: " + str(retcode) +
                         "\nSTDOUT:\n" + output[0] +
                         "\nSTDERR:\n" + output[1])
        __LOGGER__.debug("Current state: " + str(self.get_state()))
        raise RuntimeError("Job failed with final state " + str(state) +
                           ". Use the status_summary method to diagnose the issue.")
def _dml_read_app_metric_server(working_dir):
    """Read the metric-server address recorded under working_dir and fetch
    the '/progress' page from it. Returns '' when no complete address exists."""
    # Python 2/3 compatible import of urlopen.
    if sys.version_info.major == 2:
        from urllib2 import urlopen
    else:
        from urllib.request import urlopen
    address_file = os.path.join(working_dir, 'metric_server_address')
    raw = file_util.read(address_file)
    # A fully-written address file is terminated by a '$' sentinel.
    if raw and raw.endswith('$'):
        progress_url = raw[:-1] + '/progress'
        logger.info('Open url %s' % progress_url)
        return urlopen(progress_url).read()
    return ''
def get_log_metric_server_address(log_server_address_file, timeout=120):
    """Wait for the log/metric server address file to appear, then return its
    contents without the trailing '$' sentinel. Returns "" on timeout, on an
    incomplete file, or on any error (errors are logged, not raised)."""
    deadline = time.time() + timeout
    try:
        # Poll for the address file, giving up once the deadline passes.
        while not file_util.exists(log_server_address_file):
            time.sleep(.05)
            if time.time() > deadline:
                __logger__.warning('Unable to get server log (timeout reached)')
                return ""
        contents = file_util.read(log_server_address_file)
        if contents.endswith('$'):
            return contents[:-1]
    except Exception as e:
        __logger__.warning(e)
    return ""
def get_log_metric_server_address(log_server_address_file, timeout=120):
    """Block until log_server_address_file exists (polling every 50 ms), then
    return its contents with the trailing '$' sentinel stripped. Gives "" back
    when the timeout elapses, the file is incomplete, or anything raises."""
    start = time.time()
    address = ""
    try:
        timed_out = False
        while not file_util.exists(log_server_address_file):
            time.sleep(.05)
            if time.time() - start > timeout:
                __logger__.warning(
                    'Unable to get server log (timeout reached)')
                timed_out = True
                break
        if not timed_out:
            raw = file_util.read(log_server_address_file)
            # Only a '$'-terminated file is considered complete.
            if raw.endswith('$'):
                address = raw[:-1]
    except Exception as e:
        __logger__.warning(e)
    return address
def receive_from_file(self):
    """Stream any newly-appended, '$'-terminated content from self.file_url
    into self.out. Returns True when a complete chunk was consumed, False
    when the file is missing, incomplete, or an error occurred."""
    try:
        if not file_util.exists(self.file_url):
            return False
        __logger__.debug("Read from %s" % self.file_url)
        full_content = file_util.read(self.file_url)
        # Skip the prefix we have already forwarded on earlier calls.
        new_content = full_content[len(self.total_received_message):]
        # Final log file incomplete
        if not new_content.endswith('$'):
            return False
        if new_content:
            self.out.write(new_content[:-1])  # ignore $
            self.out.flush()
            self.total_received_message += new_content
        return True
    except Exception as e:
        __logger__.warning(e)
        return False
def receive_from_file(self):
    """Forward the not-yet-seen portion of the remote log file at
    self.file_url to self.out. A chunk counts as complete only when it is
    terminated by the '$' sentinel; returns True on success, False otherwise."""
    success = False
    try:
        if file_util.exists(self.file_url):
            __logger__.debug("Read from %s" % self.file_url)
            data = file_util.read(self.file_url)
            pending = data[
                len(self.total_received_message):]
            # Anything not ending in the '$' sentinel is still being written.
            if pending.endswith('$'):
                if len(pending):
                    self.out.write(pending[:-1])  # drop the sentinel
                    self.out.flush()
                    self.total_received_message += pending
                success = True
    except Exception as e:
        __logger__.warning(e)
    return success
def _get_job_log_server_address(self, timeout=10):
    """
    Return the job's log server '/progress' address, waiting up to
    `timeout` seconds for the address file to appear in the working
    directory. Returns "" on timeout; caches the address on success.
    """
    # Cached from a previous call.
    if self._log_server_address:
        return self._log_server_address
    log_server_address_file = '/'.join([self._working_dir,
                                        'metric_server_address'])
    starttime = time.time()
    # BUG FIX: the original assigned `timeout = False`, clobbering the
    # `timeout` parameter. The elapsed-time check then compared against
    # False (== 0), so the wait always "timed out" after a single 1-second
    # sleep regardless of the requested timeout. Use a separate flag.
    timed_out = False
    __LOGGER__.info('Waiting for log server address to be available')
    while not _file_util.exists(log_server_address_file):
        time.sleep(1)
        if (time.time() - starttime) > timeout:
            __LOGGER__.info('Timeout waiting for log server address')
            timed_out = True
            break
    if not timed_out:
        ret_str = _file_util.read(log_server_address_file)
        # A complete address file is terminated by a '$' sentinel.
        if ret_str.endswith('$'):
            self._log_server_address = ret_str[:-1] + "/progress"
            __LOGGER__.info('Log server address: %s' % self._log_server_address)
            return self._log_server_address
    return ""
def _dml_read_app_progress_file(working_dir):
    """Return the raw contents of the progress.log file under working_dir."""
    return file_util.read(os.path.join(working_dir, 'progress.log'))
def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exists in the unity distributed shared library.
    data : dict
        Key value arguments to the function stored in a dictionary
    env : DMLEnvironemnt
        Contains job environment parameters and a job submit function.
    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.
        - check_hdfs : {0, 1}
            Perform sanity check for hdfs read and write
        - startup_timeout : int
            Timeout in seconds for cluster setup

    Return
    ------
    (success, message, result_path) : bool, str, str
    """
    # Local import: the extension module is only needed at execution time.
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    # 'auto' selects the default remote environment.
    if env == 'auto':
        env = DMLRemoteEnvironment()

    # Make sure the working directory exists before serializing into it.
    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s' % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        # Copy so the caller's dict is not mutated by the added base path key.
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # sanitize the base path url (strip credentials from s3 urls in logs)
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)
        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. \n%s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        # Wait for the commander's log server address, scaling the timeout
        # with the number of workers that must come up.
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(
            log_server_address_path,
            timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            # Mirror remote logs into a local temp directory while printing
            # progress to stdout.
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. \nWorker logs are avaiable at %s ' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    # fd_list is only defined when logprinter was set up, so the cleanup
    # of the local log file handles must stay inside this guard.
    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)
            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have "result" or \
"exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)