def wrapper(*args, **kwargs):
    try:
        # Check that the config session exists before calling through.
        if config and config[conf_sess_name]:
            return f(*args, **kwargs)
    except KeyError:
        logger.warning('No such config session in myeconfig')
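# The wrapper above reads like the inner function of a session-guard
# decorator. A minimal sketch of the enclosing decorator it could belong
# to -- the decorator name is an assumption, and `config` and `logger`
# are assumed to be module-level objects as in the original:
import functools

def require_config_session(conf_sess_name):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                if config and config[conf_sess_name]:
                    return f(*args, **kwargs)
            except KeyError:
                logger.warning('No such config session in myeconfig')
        return wrapper
    return decorator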
def run(self, resources):
    if self.skip:
        # Nothing to convert; record an empty result and finish.
        self.metadata.update({'status': 'Done'})
        self.metadata.update({'data': 0})
        add_tag(self.dataset_id, {
            self.reader_type: self.metadata,
        })
        self.status = DONE
        return

    args = self._task_arguments()
    self.p = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
    )
    unknown_output = []
    self.before_run()
    try:
        while self.p.poll() is None:
            for line in utils.nonblocking_readlines(self.p.stdout):
                if self.aborted.is_set():
                    self.p.send_signal(signal.SIGKILL)
                if line is not None:
                    self.output_log.write(line)
                    line = line.strip()
                    if line:
                        if not self.process_output(line):
                            unknown_output.append(line)
                else:
                    time.sleep(0.05)
            time.sleep(0.01)  # do not remove this line.
    except Exception as e:
        self.p.terminate()
        self.status = ERROR
        logger.warning('Converting dataset {} to {} failed. {}'.format(
            self.dataset_id, self.reader_type, str(e)))
        logger.debug(traceback.format_exc())
        logger.debug(unknown_output)
        raise
    finally:
        self.after_run()

    if self.p.returncode != 0:
        self.status = ERROR
        logger.warning(
            'Converting dataset {} to {} failed. return code {}, {}'.format(
                self.dataset_id, self.reader_type, self.p.returncode,
                self.error_message))
        logger.debug(unknown_output)
    else:
        self.status = DONE
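# utils.nonblocking_readlines is assumed to yield a line whenever one is
# available and None when the pipe has no data, which is what the
# `if line is not None` / sleep contract above relies on. A minimal
# sketch of such a helper (an assumption, not the project's actual code):
import fcntl
import os

def nonblocking_readlines(stream):
    fd = stream.fileno()
    flags = fcntl.fcntl(fd, fcntl.F_GETFL)
    fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
    buf = b''
    while True:
        try:
            chunk = os.read(fd, 8192)
        except BlockingIOError:
            yield None  # no data ready; let the caller sleep briefly
            continue
        if not chunk:   # EOF: the child closed its end of the pipe
            if buf:
                yield buf.decode('utf-8', errors='replace')
            break
        buf += chunk
        while b'\n' in buf:
            line, buf = buf.split(b'\n', 1)
            yield (line + b'\n').decode('utf-8', errors='replace')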
def inference(id, image):
    svc = scheduler.get_instance(id)
    # Connect to the per-instance server over a zmq ipc socket.
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    try:
        address = "ipc://{}/unix.socket".format(svc._dir)
        logger.debug("Connecting to {}".format(address))
        socket.connect(address)
        socket.send(image)
        # Wait for the reply objects.
        message = socket.recv()
    except Exception as e:
        logger.warning('Inference failed: {}'.format(e))
        return []
    finally:
        # Release the socket and context so each call does not leak them.
        socket.close(linger=0)
        context.term()
    logger.debug("Reply: {}".format(message))
    return json.loads(message)
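# inference() is the REQ half of a zmq REQ/REP pair over an ipc socket.
# A minimal sketch of what the matching loop inside the model process
# could look like -- serve() and predict() are hypothetical names:
import json
import zmq

def serve(run_dir, predict):
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind("ipc://{}/unix.socket".format(run_dir))
    while True:
        image = socket.recv()        # raw image bytes from the client
        results = predict(image)     # run the model on the payload
        socket.send(json.dumps(results).encode('utf-8'))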
def save(self):
    try:
        if not os.path.exists(self._dir):
            return
        # Write to a temp file first, then rename over the real file, so
        # a crash mid-write never leaves a truncated SAVE_INFO behind.
        tmpfile_path = self.path(self.SAVE_INFO + '.tmp')
        with open(tmpfile_path, 'w') as tmpfile:
            data = json.dumps(self._dict, sort_keys=True, indent=4)
            tmpfile.write(data)
        file_path = self.path(self.SAVE_INFO)
        shutil.move(tmpfile_path, file_path)
        return True
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logger.warning('Caught %s while saving run %s: %s'
                       % (type(e).__name__, self.id, e))
        logger.debug(traceback.format_exc())
        return False
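# The same write-then-rename idiom as a standalone helper; the name
# atomic_write_json is illustrative, not part of this codebase:
import json
import os
import shutil
import tempfile

def atomic_write_json(path, obj):
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path), suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            f.write(json.dumps(obj, sort_keys=True, indent=4))
        # A rename within one filesystem is atomic, so readers always see
        # either the old file or the complete new one.
        shutil.move(tmp, path)
    except Exception:
        os.unlink(tmp)
        raise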
def run(self, resources):
    self.before_run()
    host, repo, image = self.parse_tag(self.image_tag)
    logger.info("Pulling {}... ".format(self.image_tag))
    try:
        cli = docker.APIClient(base_url='unix://var/run/docker.sock')
        with app.app_context():
            registry = get_registry(host)
            registry.login(cli)
            for line in cli.pull(self.image_tag, stream=True, decode=True):
                if "progressDetail" in line:
                    progress = line["progressDetail"]
                    # Some events carry only "current"; guard both keys.
                    if "current" in progress and "total" in progress:
                        percentage = float(progress['current']) / float(
                            progress['total'])
                        self.progress = percentage
    except Exception as e:
        logger.warning("Failed to pull image: {}".format(e))
    logger.info("done.")
    self.after_run()
    self.status = DONE
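# For reference, a decoded event from cli.pull(..., stream=True,
# decode=True) during a layer download typically looks like this
# (values illustrative):
#
#   {'status': 'Downloading',
#    'id': '3b4173355427',
#    'progressDetail': {'current': 310773, 'total': 1207165},
#    'progress': '[====>      ] 310.8kB/1.207MB'}
#
# which is why the loop above derives a completion ratio from
# progressDetail['current'] and progressDetail['total'].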
def new(username, name, image_tag, dataset_path, user_args, num_gpu,
        project, repo_path, parameters, parent):
    # Generate a short unique job id that does not collide with an
    # existing run directory.
    while True:
        id = RUNS_PREFIX + JOBS_PREFIX + str(uuid.uuid4()).replace('-', '')[:8]
        job_dir = os.path.join(RUNS_DIR, id)
        if not os.path.exists(job_dir):
            break
    try:
        args = user_args.split()
        # Translate the bucket prefix "bk/" into the mounted dataset root.
        for n, arg in enumerate(args):
            if arg.startswith("bk/"):
                args[n] = arg.replace('bk/', '/data/dataset/')
        if dataset_path:
            if dataset_path.startswith("bk/"):
                dataset_path = dataset_path.replace('bk/', '/data/dataset/')
            if not os.path.exists(dataset_path):
                raise ValueError("Cannot find dataset {}".format(dataset_path))
        inst = Instance(id=id,
                        username=username,
                        name=name,
                        image_tag=image_tag,
                        dataset_path=dataset_path,
                        user_args=args,
                        num_gpu=num_gpu,
                        project=project,
                        status_history=[],
                        repo_path=repo_path,
                        parameters=parameters,
                        parent=parent,
                        child=[])
        logger.debug("Create instance {}".format(inst.id))
    except Exception as e:
        logger.warning('Caught %s while creating instance %s: %s'
                       % (type(e).__name__, id, e))
        logger.debug(traceback.format_exc())
        raise
    return inst
def run(self, resources):
    self.before_run()
    env = os.environ.copy()
    env['PYTHONPATH'] = os.pathsep.join(
        ['.', self._dir, env.get('PYTHONPATH', '')] + sys.path)
    # Restrict the child process to the GPUs allocated by the scheduler.
    gpus = [i for (i, _) in resources['gpus']]
    env['CUDA_VISIBLE_DEVICES'] = ','.join(str(g) for g in gpus)
    root = os.path.dirname(os.path.abspath(myelindl.__file__))
    args = [
        sys.executable, '-m',
        os.path.join(root, 'tools', 'unix_server'),
        '--checkpoint-path=%s' % self.checkpoint_path,
        '--job-dir=%s' % self._dir,
    ]
    logger.debug("run args: {}".format(args))
    self.p = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=self._dir,
        close_fds=True,
        env=env,
    )
    try:
        sigterm_time = None  # When was the SIGTERM signal sent
        sigterm_timeout = 120  # When should the SIGKILL signal be sent
        while self.p.poll() is None:
            for line in utils.nonblocking_readlines(self.p.stdout):
                if self.aborted.is_set():
                    if sigterm_time is None:
                        # Attempt graceful shutdown
                        self.p.send_signal(signal.SIGTERM)
                        sigterm_time = time.time()
                        self.status = ABORT
                    break
                if line is not None:
                    # Remove whitespace
                    line = line.strip()
                    if line:
                        self.output_log.write('%s\n' % line)
                        self.output_log.flush()
                else:
                    time.sleep(0.05)
            # Escalate to SIGKILL if graceful shutdown took too long.
            if sigterm_time is not None and (
                    time.time() - sigterm_time > sigterm_timeout):
                self.p.send_signal(signal.SIGKILL)
                logger.debug('Sent SIGKILL to task "%s"' % self.name)
            time.sleep(0.01)
    except Exception as e:
        logger.warning('service exception: {}'.format(e))
        self.p.terminate()
        self.after_run()
        raise

    self.after_run()
    if self.status != RUN:
        return False
    if self.p.returncode != 0:
        self.returncode = self.p.returncode
        self.status = ERROR
    else:
        self.status = DONE
    return True
def myconfig_value(keyword, def_value):
    try:
        return myconfig[SENAME][keyword]
    except KeyError:
        logger.warning('Key %s not found in myconfig; using default value',
                       keyword)
        return def_value
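# Hypothetical usage: read a tunable with a safe fallback when the key
# is absent from the config section (the key name is illustrative).
batch_size = myconfig_value('batch_size', 32)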