def _get_profiling_job_id(self):
    """Get profiling job id, which was generated by ada service.

    Returns the cached ``self._profiling_job_id`` when it is set. Otherwise
    the non-hidden entries of ``self._output_path`` whose name contains
    ``JOB`` are examined from the most recently modified to the oldest, and
    the first entry whose ``host_start.log`` reports this device id is taken.
    If that entry started before ``self._start_time`` the search stops
    without a result.

    Returns:
        str: profiling job id.

    Raises:
        RuntimeError: If no suitable profiling job directory is found.
    """
    if self._profiling_job_id:
        return self._profiling_job_id

    def _newest_first(name):
        # Sort key: newest modification time first, ties broken by name.
        # An entry that vanished mid-scan sorts last instead of aborting.
        try:
            mtime_ns = os.lstat(os.path.join(self._output_path, name)).st_mtime_ns
        except OSError:
            mtime_ns = 0
        return -mtime_ns, name

    # List the directory directly instead of running `ls | grep | awk` in a
    # shell: the output path was pasted unquoted into the command line, so
    # spaces or shell metacharacters in it broke (or hijacked) the lookup.
    try:
        names = os.listdir(self._output_path)
    except OSError:
        # A missing or unreadable directory yields no candidates, so the
        # RuntimeError below is raised just as when the listing is empty.
        names = []

    candidates = [
        name for name in names
        if "JOB" in name and not name.startswith(".")
    ]
    candidates.sort(key=_newest_first)

    job_id = ""
    for job_name in candidates:
        path = os.path.join(self._output_path, job_name)
        log_file = get_file_names(path, "host_start.log")
        if not log_file:
            logger.error(
                "Profiling: job path %s, host_start.log not exist.", path)
            continue

        log_file = os.path.join(path, log_file[0])
        item_dict = self._parse_host_start_log(log_file)
        if not item_dict:
            logger.error(
                "Profiling: job path %s, fail to get job start info.", path)
            continue

        if self._dev_id != item_dict["device_id"]:
            logger.info(
                "Profiling: job path %s, dev id %s, training device id %s.",
                path, item_dict["device_id"], self._dev_id)
            continue

        if self._start_time > int(item_dict["start_time"]):
            # The newest job for this device predates the training start,
            # so give up rather than fall back to an even older job.
            logger.info(
                "Profiling: job path %s, start_time %s, training start_time %d.",
                path, item_dict["start_time"], self._start_time)
            break

        job_id = job_name
        break

    if not job_id:
        msg = "Fail to get profiling job, please check whether job dir was generated"
        raise RuntimeError(msg)

    return job_id
def _get_profiling_job_id(self):
    """Find the profiling job directory that belongs to the current device.

    Entries of ``self._output_path`` whose name starts with ``JOB`` are
    scanned in the order ``os.listdir`` yields them. The device id of a job
    is taken from the text after the last ``.`` in the name of its
    ``host_start.log`` file. The first job whose device id equals
    ``self._dev_id`` ends the scan.

    Returns:
        str, profiling job id (the name of the matching job directory).

    Raises:
        RuntimeError: If no job directory was matched.
    """
    matched_job = ""

    for entry in os.listdir(self._output_path):
        if not entry.startswith('JOB'):
            continue

        job_dir = os.path.join(self._output_path, entry)
        start_logs = get_file_names(job_dir, "host_start.log")
        if not start_logs:
            logger.error(
                "Profiling: job path %s, host_start.log not exist.", job_dir)
            continue

        # The last dot-separated part of the log file name is compared with
        # this process's device id.
        log_device_id = start_logs[0].split('.')[-1]
        if self._dev_id != log_device_id:
            logger.info(
                "Profiling: job path %s, dev id %s, training device id %s.",
                job_dir, log_device_id, self._dev_id)
            continue

        # A job for this device was found; the scan stops here whether or
        # not its start info can be parsed.
        start_log_path = os.path.join(job_dir, start_logs[0])
        job_started_at = self._parse_host_start_log(start_log_path)
        if not job_started_at:
            logger.error(
                "Profiling: job path %s, fail to get job start info.", job_dir)
            break

        matched_job = entry
        if self._start_time > int(job_started_at):
            # Only logged: a job older than the training start is still used.
            logger.info(
                "Profiling: job path %s, start_time %s, training start_time %d.",
                job_dir, job_started_at, self._start_time)
        break

    if matched_job:
        return matched_job

    raise RuntimeError(
        "Fail to get profiling job, please check whether job dir was generated, "
        "or may be the device id from job dir dismatch the device_id in current process."
    )
def _get_profiling_job_id(self):
    """Look up the profiling job directory under the output path.

    Only the first entry of ``self._output_path`` whose name starts with
    ``JOB`` is inspected, because every path through the loop body ends the
    scan. The entry is accepted once its ``host_start.log`` can be found and
    parsed. A device id mismatch or a start time earlier than
    ``self._start_time`` is only logged.

    Returns:
        str, profiling job id (the name of the accepted job directory).

    Raises:
        RuntimeError: If no job directory was accepted.
    """
    found_job = ""

    for entry in os.listdir(self._output_path):
        if not entry.startswith('JOB'):
            continue

        job_dir = os.path.join(self._output_path, entry)
        start_logs = get_file_names(job_dir, "host_start.log")
        if not start_logs:
            logger.error(
                "Profiling: job path %s, host_start.log not exist.", job_dir)
            break

        start_info = self._parse_host_start_log(
            os.path.join(job_dir, start_logs[0]))
        if not start_info:
            logger.error(
                "Profiling: job path %s, fail to get job start info.", job_dir)
            break

        found_job = entry

        # Both checks below are informational; the job is kept regardless.
        if self._dev_id != start_info["device_id"]:
            logger.info(
                "Profiling: job path %s, dev id %s, training device id %s.",
                job_dir, start_info["device_id"], self._dev_id)
        if self._start_time > int(start_info["start_time"]):
            logger.info(
                "Profiling: job path %s, start_time %s, training start_time %d.",
                job_dir, start_info["start_time"], self._start_time)
        break

    if found_job:
        return found_job

    raise RuntimeError(
        "Fail to get profiling job, please check whether job dir was generated")