def _get_progress_from_file(self):
    log_location = '/'.join([self._working_dir, 'progress.log'])
    if _file_util.exists(log_location):
        content = _file_util.read(log_location)
        if content:
            # Drop the trailing '$' end-of-log sentinel.
            return content[:-1]
    else:
        __LOGGER__.warn('Progress log file cannot be found')
    return ""

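# A minimal standalone sketch of the '$' sentinel convention these log readers
# rely on: the writer terminates a complete flush with '$', and readers strip
# it (or treat its absence as an incomplete write). The file name and contents
# below are hypothetical; only the sentinel handling mirrors the code above.
import os
import tempfile

def _demo_sentinel_roundtrip():
    path = os.path.join(tempfile.mkdtemp(), 'progress.log')
    with open(path, 'w') as f:
        f.write('Iteration 1: loss = 0.5\n$')  # '$' marks a complete write
    with open(path) as f:
        content = f.read()
    assert content.endswith('$')
    return content[:-1]                        # strip the sentinel, as above
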
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive; if not, use GLUnpickler to load
    # as a pure python model.
    #
    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is an http location, skip the check, and directly proceed
    # to load the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http.
    if (not file_util.get_protocol(location) in ['http', 'https']) and \
            (not file_util.exists(location + '/dir_archive.ini')):
        # Not a ToolkitError, so try unpickling the model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version.
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)

def get_log_metric_server_address(log_server_address_file, timeout=120):
    starttime = time.time()
    try:
        # Poll until the server writes its address file, or until timeout.
        while not file_util.exists(log_server_address_file):
            time.sleep(.05)
            if (time.time() - starttime) > timeout:
                __logger__.warning('Unable to get server log (timeout reached)')
                return ""
        ret_str = file_util.read(log_server_address_file)
        # A trailing '$' marks a complete write; strip it before returning.
        if ret_str.endswith('$'):
            return ret_str[:-1]
    except Exception as e:
        __logger__.warning(e)
    return ""

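# Hedged usage sketch, not from the source: wait for the address file a job
# writes into its working directory. The 'metric_server_address' file name is
# an assumption modeled on _get_job_log_server_address further below.
def _demo_wait_for_metric_server(working_dir):
    address_file = '/'.join([working_dir, 'metric_server_address'])
    address = get_log_metric_server_address(address_file, timeout=60)
    if not address:
        raise RuntimeError('Metric server did not come up within the timeout')
    return address
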
def receive_from_file(self):
    try:
        if file_util.exists(self.file_url):
            __logger__.debug("Read from %s" % self.file_url)
            content = file_util.read(self.file_url)
            leftover_progress_content = content[len(self.total_received_message):]
            # Final log file incomplete
            if not leftover_progress_content.endswith('$'):
                return False
            if len(leftover_progress_content):
                self.out.write(leftover_progress_content[:-1])  # ignore $
                self.out.flush()
                self.total_received_message += leftover_progress_content
        return True
    except Exception as e:
        __logger__.warning(e)
        return False

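# Minimal standalone sketch of the incremental-tail pattern above: remember how
# much has already been emitted, forward only the new suffix, and treat a
# missing trailing '$' as an incomplete write. The file read is replaced with
# in-memory snapshots to keep the example self-contained.
import sys

def _demo_incremental_tail():
    emitted = ''
    for snapshot in ('line 1\n', 'line 1\nline 2\n$'):
        leftover = snapshot[len(emitted):]
        if not leftover.endswith('$'):
            continue                        # incomplete write: wait for next read
        sys.stdout.write(leftover[:-1])     # forward only new content, drop '$'
        sys.stdout.flush()
        emitted += leftover
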
def _get_job_log_server_address(self, timeout=10):
    if self._log_server_address:
        return self._log_server_address

    log_server_address_file = '/'.join([self._working_dir,
                                        'metric_server_address'])
    starttime = time.time()
    timed_out = False
    __LOGGER__.info('Waiting for log server address to be available')
    # Poll until the address file appears or the timeout is reached. Note the
    # timed-out flag must not reuse the name of the `timeout` parameter.
    while not _file_util.exists(log_server_address_file):
        time.sleep(1)
        if (time.time() - starttime) > timeout:
            __LOGGER__.info('Timeout waiting for log server address')
            timed_out = True
            break
    if not timed_out:
        ret_str = _file_util.read(log_server_address_file)
        # A trailing '$' marks a complete write.
        if ret_str.endswith('$'):
            self._log_server_address = ret_str[:-1] + "/progress"
            __LOGGER__.info('Log server address: %s' % self._log_server_address)
            return self._log_server_address
    return ""

def dml_exec(function_name, data, env='auto', verbose=True, **kwargs):
    """
    Executes a distributed ml function.

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.
    data : dict
        Key value arguments to the function stored in a dictionary.
    env : DMLEnvironment
        Contains job environment parameters and a job submit function.
    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.

        - check_hdfs : {0, 1}
            Perform a sanity check for hdfs read and write.
        - startup_timeout : int
            Timeout in seconds for cluster setup.

    Returns
    -------
    (success, message, result_path) : bool, str, str
    """
    from graphlab.extensions import dml_function_invocation, init_dml_class_registry
    init_dml_class_registry()

    if env == 'auto':
        env = DMLRemoteEnvironment()

    if not file_util.exists(env.working_dir):
        _log.debug('Creating working directory: %s' % env.working_dir)
        file_util.mkdir(env.working_dir)
    else:
        _log.debug('Using existing working directory: %s' % env.working_dir)

    _log.info('Running distributed execution with %d workers. Working directory: %s'
              % (env.num_workers, env.working_dir))

    success = False
    message = ""
    result_path = None

    # Job function arguments
    try:
        _log.info('Serializing arguments to %s' % env.working_dir)
        args = dml_function_invocation()
        data_copy = copy(data)
        internal_working_dir = _make_internal_url(env.working_dir)
        data_copy['__base_path__'] = internal_working_dir
        args.from_dict(data_copy, internal_working_dir)
        json_data = args.to_str()

        # sanitize the base path url
        sanitized_json_data = json_data
        if file_util.is_s3_path(json_data):
            sanitized_json_data = _sanitize_internal_s3_url(json_data)
        _log.info('Serialized arguments: %s' % sanitized_json_data)
    except Exception as e:
        success = False
        message = 'Error serializing arguments. %s' % str(e)
        return (success, message, None)

    # Submit job
    try:
        job = dml_submit(function_name, json_data, env,
                         metric_server_address_file=COMMANDER_LOG_SERVER_ADDRESS_FILE,
                         logprogress_file=PROGRESS_LOG_FILE,
                         **kwargs)
    except KeyboardInterrupt:
        message = 'Canceled by user'
        return (success, message, None)

    _log.info('Waiting for workers to start ... ')
    logprinter = None
    if verbose:
        log_server_address_path = os.path.join(env.working_dir,
                                               COMMANDER_LOG_SERVER_ADDRESS_FILE)
        log_server_address = get_log_metric_server_address(
            log_server_address_path,
            timeout=INIT_TIMEOUT_PER_WORKER * env.num_workers)
        if len(log_server_address) > 0:
            tmp_log_dir = tempfile.mkdtemp(prefix='graphlab_dml_log_')
            fd_list = []
            logprinter = LogPrinter()
            # Attach log progress stream
            logprinter.add_stream(LogStream(log_server_address + '/progress',
                                            os.path.join(env.working_dir, PROGRESS_LOG_FILE),
                                            sys.stdout))
            # Attach commander log stream
            local_commander_log = open(os.path.join(tmp_log_dir, COMMANDER_LOG_FILE), 'w')
            fd_list.append(local_commander_log)
            logprinter.add_stream(LogStream(log_server_address + '/commander',
                                            os.path.join(env.working_dir, COMMANDER_LOG_FILE),
                                            local_commander_log))
            # Attach worker log streams
            for i in range(env.num_workers):
                local_worker_log = open(os.path.join(tmp_log_dir, WORKER_LOG_FILE(i)), 'w')
                fd_list.append(local_worker_log)
                logprinter.add_stream(LogStream(log_server_address + '/worker%d' % i,
                                                os.path.join(env.working_dir, WORKER_LOG_FILE(i)),
                                                local_worker_log))
            logprinter.start()
            _log.info('Success. Worker logs are available at %s' % tmp_log_dir)

    _log.debug('Wait for job to finish')
    (success, message) = _wait_and_parse_job_result(job)

    if logprinter:
        logprinter.stop()
        for fd in fd_list:
            fd.close()

    if success:
        try:
            result_path = os.path.join(env.working_dir, env.output_name)
            ret_str = file_util.read(result_path)
            sanitized_ret_str = _sanitize_internal_s3_url(ret_str)
            _log.debug('Deserializing results: %s' % sanitized_ret_str)
            args.from_str(ret_str)
            response = args.to_dict()

            # Check toolkit response for "result" key or "exception" key.
            if 'result' in response:
                return (success, message, response['result'])
            elif 'exception' in response:
                return (False, response['exception'], None)
            else:
                raise ValueError('Invalid toolkit response. Must have "result" '
                                 'or "exception" as key')
        except Exception as e:
            success = False
            message = 'Error deserializing results. %s' % str(e)
            return (success, message, None)
    else:
        return (success, message, None)

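# Hedged usage sketch for dml_exec. The function name and argument keys are
# hypothetical; a real call must name a symbol that actually exists in the
# unity distributed shared library.
def _demo_dml_exec():
    success, message, result = dml_exec(
        'distributed_pagerank',                # hypothetical toolkit symbol
        {'graph': 'hdfs:///data/web_graph'},   # key/value args, as `data`
        env='auto',                            # builds a DMLRemoteEnvironment
        check_hdfs=1,                          # sanity-check hdfs read/write
        startup_timeout=300)                   # cluster setup timeout (seconds)
    if not success:
        raise RuntimeError('Distributed job failed: %s' % message)
    return result
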
def dml_submit(function_name, str_data, env, **kwargs):
    """
    Submits a distributed ml function for execution and returns the job handle.

    Parameters
    ----------
    function_name : str
        Name of the distributed function to be executed. The function symbol
        must exist in the unity distributed shared library.
    str_data : str
        Arguments as serialized string to be passed to the distributed function.
    env : DMLEnvironment
        Contains job environment parameters and a job submit function.
    **kwargs : dict
        Additional options. See _get_worker_args and _get_commander_args.

        - check_hdfs : {0, 1}
            Perform a sanity check for hdfs read and write.
        - startup_timeout : int
            Timeout in seconds for cluster setup.

    Returns
    -------
    job : map_job
    """
    _log.debug('Submitting job')
    if not file_util.exists(env.working_dir):
        file_util.mkdir(env.working_dir)

    map_job_args = _get_dml_exec_args(function_name, str_data, env,
                                      output_name=env.output_name,
                                      **kwargs)
    _log.debug('job arguments: %s' % str(map_job_args))

    # The following code achieves the same as
    #     return env.submit(subprocess_exec, map_job_args)
    # but requires one less container. (Having the commander code take one
    # entire container is wasteful.) It uses group_exec and packs the commander
    # function and the first worker function into one map task. The remaining
    # workers stay the same.
    # group_exec returns a list of results, so the output is a nested list of
    # results; we overload the job.get_results function to flatten the results.
    def commander_exec():
        return lambda: subprocess_exe(**map_job_args[0])

    def worker_exec(i):
        return lambda: subprocess_exe(**map_job_args[i + 1])

    worker_to_function_group = [[worker_exec(i)] for i in range(env.num_workers)]
    worker_to_function_group[0].insert(0, commander_exec())
    job = env.submit(group_exec,
                     [{'lambdas': fgroup} for fgroup in worker_to_function_group])

    # Decorate the job get_results function to flatten the nested results.
    def flatten_results(packed_results):
        return [item for sublist in packed_results for item in sublist]

    def decorate_with_flatten_results(f_original):
        def f_decorated():
            results = f_original()
            return flatten_results(results)
        return f_decorated
    job.get_results = decorate_with_flatten_results(job.get_results)

    return job

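# Minimal standalone sketch of the packing trick used above: the commander
# shares the first worker's slot, and the grouped (nested) results are
# flattened back into one flat list. group_exec and env.submit are stood in
# for by plain list operations, so this runs without the cluster machinery.
def _demo_group_packing(num_workers=3):
    groups = [['worker%d' % i] for i in range(num_workers)]
    groups[0].insert(0, 'commander')       # commander rides in slot 0
    # group_exec would run each group in one container, returning nested results
    nested = [[task.upper() for task in group] for group in groups]
    flat = [item for sublist in nested for item in sublist]
    return flat    # ['COMMANDER', 'WORKER0', 'WORKER1', 'WORKER2']
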
def load_model(location):
    """
    Load any GraphLab Create model that was previously saved.

    This function assumes the model (can be any model) was previously saved in
    GraphLab Create model format with model.save(filename).

    Parameters
    ----------
    location : string
        Location of the model to load. Can be a local path or a remote URL.
        Because models are saved as directories, there is no file extension.

    Examples
    --------
    >>> model.save('my_model_file')
    >>> loaded_model = gl.load_model('my_model_file')
    """
    _mt._get_metric_tracker().track('toolkit.model.load_model')

    # Check if the location is a dir_archive; if not, use GLUnpickler to load
    # as a pure python model.
    #
    # We need to fix this sometime, but here is the explanation of the stupid
    # check below:
    #
    # If the location is an http location, skip the check, and directly proceed
    # to load the model as a dir_archive. This is because
    # 1) exists() does not work with the http protocol, and
    # 2) GLUnpickler does not support http.
    protocol = file_util.get_protocol(location)
    dir_archive_exists = False
    if protocol == '':
        model_path = file_util.expand_full_path(location)
        dir_archive_exists = file_util.exists(
            os.path.join(model_path, 'dir_archive.ini'))
    else:
        model_path = location
        if protocol in ['http', 'https']:
            dir_archive_exists = True
        else:
            import posixpath
            dir_archive_exists = file_util.exists(
                posixpath.join(model_path, 'dir_archive.ini'))
    if not dir_archive_exists:
        # Not a ToolkitError, so try unpickling the model.
        unpickler = gl_pickle.GLUnpickler(location)

        # Get the version.
        version = unpickler.load()

        # Load the class name.
        cls_name = unpickler.load()
        cls = _get_class_from_name(cls_name)

        # Load the object with the right version.
        model = cls._load_version(unpickler, version)

        unpickler.close()
        return model
    else:
        _internal_url = _make_internal_url(location)
        return glconnect.get_unity().load_model(_internal_url)
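
# Hedged usage sketch: the docstring example, extended to a remote location.
# The S3 bucket name is hypothetical; any URL file_util understands behaves
# the same way, since dir_archive detection keys off 'dir_archive.ini'.
# >>> model.save('s3://my-bucket/my_model')
# >>> loaded_model = gl.load_model('s3://my-bucket/my_model')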