def __init__(self, username, dataset_id, input_dir, reader_type, parameters=None):
    """Set up a non-persistent dataset conversion task.

    Args:
        username: Owner of the task; forwarded to the base initializer.
        dataset_id: Identifier of the dataset to convert.
        input_dir: Directory to read from. When falsy, the value of
            ``get_dataset_path(dataset_id)`` is used instead.
        reader_type: Name of the target reader. Membership in
            ``SKIP_CONVERT`` marks the task to be skipped.
        parameters: Optional dict of conversion parameters. A fresh empty
            dict is created per instance when omitted, so instances never
            share (and mutate) one default object.
    """
    logger.debug('Convert inint {} '.format(reader_type))
    # The timestamp keeps ids distinct for repeated conversions of the same
    # user/reader combination; the id doubles as the task name.
    task_id = 'conv-{}-{}-{}'.format(username, reader_type, time.time())
    super(ConversionTask, self).__init__(
        task_id, username, task_id, persistent=False)
    self.dataset_id = dataset_id
    self.input_dir = input_dir if input_dir else get_dataset_path(
        self.dataset_id)
    self.reader_type = reader_type
    self.output_dir = self._get_output_dir()
    self.parameters = {} if parameters is None else parameters
    self.metadata = {}
    self.error_message = []
    self.skip = reader_type in SKIP_CONVERT
    logger.debug('Convert init finish {} '.format(self.reader_type))
    self.status = INIT
def pkg_clone(repo_path, parent_dir, child_dir):
    """Copy a repository's package directory from one job dir to another.

    The directory name is derived from *repo_path* via ``_repo_name``; the
    directory of that name under *parent_dir* is copied to the same name
    under *child_dir* using ``system_copy``.
    """
    name = _repo_name(repo_path)
    source = os.path.join(parent_dir, name)
    target = os.path.join(child_dir, name)
    logger.debug("Copy pkg {} to {}".format(source, target))
    system_copy(source, target)
def inference(id, image):
    """Send *image* to a running service over its unix socket and return the reply.

    Args:
        id: Identifier passed to ``scheduler.get_instance`` to find the service.
        image: Payload handed to ``socket.send`` unchanged.

    Returns:
        The reply decoded with ``json.loads``, or ``[]`` if connecting,
        sending or receiving raised an exception.
    """
    svc = scheduler.get_instance(id)
    context = None
    socket = None
    # Connect to server by zmq
    try:
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        address = "ipc://{}/unix.socket".format(svc._dir)
        logger.debug("Connecting to {}".format(address))
        socket.connect(address)
        socket.send(image)
        # waiting for reply objects
        message = socket.recv()
    except Exception as e:
        logger.warning('Inference fail: {}'.format(e))
        return []
    finally:
        # A context/socket pair is created per call, so release both here;
        # otherwise every request leaks file descriptors and an I/O thread.
        if socket is not None:
            # linger=0 drops unsent data so term() below cannot block.
            socket.close(linger=0)
        if context is not None:
            context.term()
    logger.debug("Reply: {}".format(message))
    return json.loads(message)
def save(self):
    """Write ``self._dict`` as JSON to the run directory.

    The data is written to ``<SAVE_INFO>.tmp`` first and then moved over
    the final file, so a reader never sees a partially written file.

    Returns:
        ``None`` when the file was written, when ``self._dir`` does not
        exist (nothing is written), or when interrupted by
        ``KeyboardInterrupt``; ``False`` when any other exception occurred.
    """
    try:
        if not os.path.exists(self._dir):
            return
        tmpfile_path = self.path(self.SAVE_INFO + '.tmp')
        # Serialize before opening so a serialization error leaves no
        # empty temp file behind.
        data = json.dumps(self._dict, sort_keys=True, indent=4)
        # The file is opened in binary mode; encode text so the write works
        # on interpreters where json.dumps returns a unicode string.
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        with open(tmpfile_path, 'wb') as tmpfile:
            tmpfile.write(data)
        file_path = self.path(self.SAVE_INFO)
        shutil.move(tmpfile_path, file_path)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        logger.warning('Caught %s while saving run %s: %s' %
                       (type(e).__name__, self.id, e))
        # format_exc() takes no exception argument (its first parameter is
        # the frame limit); it formats the exception being handled.
        logger.debug(traceback.format_exc())
        return False
def get_value_decorator(config, conf_sess_name):
    """Build a decorator that runs a function only if a config section is set.

    The wrapped function is called, and its result returned, only when
    *config* is truthy and ``config[conf_sess_name]`` is truthy. In every
    other case the wrapper returns ``None``. A ``KeyError`` raised by the
    lookup (or by the wrapped call itself) is logged as a warning and
    swallowed.
    """
    logger.debug("config = {}, \nconf_sess_name = {}".format(
        config, conf_sess_name))

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            try:
                # Section missing or empty: skip the call entirely.
                if not (config and config[conf_sess_name]):
                    return None
                return f(*args, **kwargs)
            except KeyError:
                logger.warning('No such config session in myeconfig')
            return None
        return wrapper
    return decorator
def new(username, name, image_tag, dataset_path, user_args, num_gpu, project,
        repo_path, parameters, parent):
    """Create an ``Instance`` with a freshly generated, unused job id.

    Args:
        username, name, image_tag, num_gpu, project, repo_path, parameters,
            parent: Passed through to ``Instance`` unchanged.
        dataset_path: Optional dataset location. A leading ``bk/`` is
            rewritten to ``/data/dataset/``; the resulting path must exist.
        user_args: Whitespace-separated argument string. Each argument that
            starts with ``bk/`` gets the same prefix rewrite.

    Returns:
        The new ``Instance``.

    Raises:
        ValueError: If *dataset_path* is given but does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    # Draw random ids until one does not collide with an existing run dir.
    while True:
        job_id = RUNS_PREFIX + JOBS_PREFIX + \
            str(uuid.uuid4()).replace('-', '')[:8]
        job_dir = os.path.join(RUNS_DIR, job_id)
        if not os.path.exists(job_dir):
            break
    try:
        args = user_args.split()
        for n, arg in enumerate(args):
            if arg.startswith("bk/"):
                # count=1: only the leading prefix is an alias; a later
                # "bk/" inside the path must be left untouched.
                args[n] = arg.replace('bk/', '/data/dataset/', 1)
        if dataset_path:
            if dataset_path.startswith("bk/"):
                dataset_path = dataset_path.replace(
                    'bk/', '/data/dataset/', 1)
            if not os.path.exists(dataset_path):
                raise ValueError(
                    "Cannot find dataset {}".format(dataset_path))
        inst = Instance(id=job_id,
                        username=username,
                        name=name,
                        image_tag=image_tag,
                        dataset_path=dataset_path,
                        user_args=args,
                        num_gpu=num_gpu,
                        project=project,
                        status_history=[],
                        repo_path=repo_path,
                        parameters=parameters,
                        parent=parent,
                        child=[])
        logger.debug("Create instance {}".format(inst.id))
    except Exception as e:
        logger.warning('Caught %s while creating instance %s: %s' %
                       (type(e).__name__, job_id, e))
        # format_exc() formats the exception currently being handled; its
        # first parameter is a frame limit, not the exception.
        logger.debug(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise
    return inst
def run(self, resources):
    """Run the conversion subprocess and record the outcome in ``self.status``.

    When ``self.skip`` is set no process is started: the dataset is tagged
    with a 'Done' status and zero data, and the task is marked DONE.

    Otherwise the command from ``self._task_arguments()`` is started, its
    combined stdout/stderr is copied to ``self.output_log`` and fed line by
    line to ``self.process_output``. Lines that ``process_output`` rejects
    are collected and logged at debug level on failure.

    Args:
        resources: Not used by this method.

    Raises:
        Exception: Any exception raised while polling is logged and
            re-raised after the process has been terminated and the status
            set to ERROR.
    """
    if self.skip:
        self.metadata.update({'status': 'Done'})
        self.metadata.update({'data': 0})
        add_tag(self.dataset_id, {
            self.reader_type: self.metadata,
        })
        self.status = DONE
        return

    args = self._task_arguments()
    self.p = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        close_fds=True,
    )
    unknown_output = []
    self.before_run()
    try:
        while self.p.poll() is None:
            # presumably yields None when no output is pending — confirm
            # against utils.nonblocking_readlines.
            for line in utils.nonblocking_readlines(self.p.stdout):
                if self.aborted.is_set():
                    self.p.send_signal(signal.SIGKILL)
                if line is not None:
                    self.output_log.write(line)
                    line = line.strip()
                if line:
                    if not self.process_output(line):
                        unknown_output.append(line)
                else:
                    # Nothing useful read; back off briefly.
                    time.sleep(0.05)
            time.sleep(0.01)  # do not remove this line.
    except Exception as e:
        self.p.terminate()
        self.status = ERROR
        logger.warning('Convert dataset {} to {} fail. {}'.format(
            self.dataset_id, self.reader_type, str(e)))
        # format_exc() takes a frame limit, not the exception; it formats
        # the exception currently being handled.
        logger.debug(traceback.format_exc())
        logger.debug(unknown_output)
        raise
    finally:
        self.after_run()

    if self.p.returncode != 0:
        self.status = ERROR
        logger.warning(
            'Convert dataset {} to {} fail. return code {}, {}'.format(
                self.dataset_id, self.reader_type, self.p.returncode,
                self.error_message))
        logger.debug(unknown_output)
    else:
        self.status = DONE
def offer_resources(self, resources):
    """Pick the ports and GPUs this task needs from the offered pool.

    Args:
        resources: Mapping with 'ports', 'gpus' and 'hosts' lists. Each
            entry exposes ``identifier`` and ``remaining()``.

    Returns:
        A dict ``{'gpus': [...], 'ports': [...], 'hosts': [...]}`` of
        ``(identifier, 1)`` tuples when every requested port and GPU could
        be found; the same dict is stored in ``self.resources`` and the
        task is saved. ``None`` when the request cannot be satisfied.
    """
    # A missing (None) port list or GPU count means "none requested";
    # previously None crashed on len() or could never be satisfied.
    wanted_ports = len(self.port_list) if self.port_list else 0
    wanted_gpus = self.num_gpu if self.num_gpu else 0

    port_identifiers = []
    if wanted_ports:
        for resource in resources['ports']:
            if resource.remaining() >= 1:
                # check for port availability by actually publishing the
                # port from a throw-away container
                cmd = 'docker run --rm -p {}:8888 {} bash'.format(
                    resource.identifier, DEFAULT_IMG)
                ret = os.system(cmd)
                if ret == 0:
                    port_identifiers.append(resource.identifier)
                    if len(port_identifiers) == wanted_ports:
                        break
    logger.debug("port_identifiers: {}".format(port_identifiers))

    gpu_identifiers = []
    if wanted_gpus:
        for resource in resources['gpus']:
            if resource.remaining() >= 1:
                gpu_identifiers.append(resource.identifier)
                if len(gpu_identifiers) == wanted_gpus:
                    break
    logger.debug("gpu_identifiers: {}".format(gpu_identifiers))

    if len(port_identifiers) != wanted_ports or \
            len(gpu_identifiers) != wanted_gpus:
        return None

    logger.debug("return resources.. ")
    granted = {
        'gpus': [(i, 1) for i in gpu_identifiers],
        'ports': [(i, 1) for i in port_identifiers],
        # The first offered host is always used.
        'hosts': [(resources['hosts'][0].identifier, 1)]
    }
    self.resources = granted
    self.save()
    return granted
def _validate_bucket_name(name):
    """Raise ValueError unless *name* is 3 to 63 ASCII letters or digits."""
    if not name:
        reason = 'bucket without name.'
    elif len(name) < 3:
        reason = 'Bucket name cannot be smaller than 3 characters'
    elif len(name) > 63:
        reason = 'Bucket name cannot be greater than 63 characters'
    elif not re.match(r'[a-zA-Z0-9]+\Z', name):
        reason = 'Bucket name contains invalid characters'
    else:
        return
    logger.debug(reason)
    raise ValueError(reason)


def add(username, id, vol_path='', create_time=None):
    """Register a new dataset (bucket) in the dataset metadata.

    Runs entirely under ``metadata.lock``. If *vol_path* is given, a
    symlink from the dataset location to it is created; if the dataset
    location still does not exist afterwards, a bucket is created with
    ``mc mb``.

    Args:
        username: Owner recorded in the dataset entry.
        id: Dataset/bucket name; 3 to 63 ASCII letters or digits.
        vol_path: Optional existing local path to link the dataset to.
            Empty or ``None`` means no link.
        create_time: Optional creation timestamp string; defaults to the
            current local time formatted as ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError: If *id* is empty, too short, too long, or contains
            characters other than ASCII letters and digits.
        Exception: If the dataset already exists or *vol_path* is given
            but does not exist.
    """
    # update metadata
    with metadata.lock:
        data = metadata.read_dataset_metadata()
        _validate_bucket_name(id)
        if id in data['datasets']:
            logger.debug('bucket {} already exist'.format(id))
            raise Exception('Dataset {} already exists'.format(id))

        if create_time:
            c_time = create_time
        else:
            c_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        dst_path = os.path.join(DATA_ROOT, "dataset", id)
        # Test vol_path itself before touching the filesystem so that a
        # None value means "no volume" instead of raising TypeError.
        if vol_path:
            if not os.path.exists(vol_path):
                raise Exception('local path {} not exists'.format(vol_path))
            logger.debug("create link {} -> {}".format(vol_path, dst_path))
            os.symlink(vol_path, dst_path)
        if not os.path.exists(dst_path):
            # id was validated as alphanumeric above, so it is safe to
            # interpolate into the shell command.
            check_output("mc mb mlsteam/{}".format(id), shell=True)

        dataset = {
            'id': id,
            'name': id,
            'description': '',
            'username': username,
            'from': 'CLI',
            'source': 'local',
            'type': 'file',
            'size': 0,
            'data_dir': get_dataset_path(id),
            'create_time': c_time,
            'vol_path': dst_path if vol_path else '',
        }
        data['datasets'][id] = dataset
        metadata.save_dataset_metadata(data)
def run(self, resources):
    """Start the inference server subprocess and supervise it until it exits.

    The server's combined stdout/stderr is copied, stripped and
    line-terminated, into ``self.output_log``. When ``self.aborted`` is
    set the process receives SIGTERM once and the status becomes ABORT;
    if it is still alive ``sigterm_timeout`` seconds later it receives
    SIGKILL.

    Args:
        resources: Mapping whose 'gpus' entry is a list of
            ``(identifier, amount)`` pairs; the identifiers are exported
            as CUDA_VISIBLE_DEVICES.

    Returns:
        ``False`` if the status is no longer RUN when the process ends
        (for example after an abort); otherwise ``True``, with the status
        set to DONE for exit code 0 and ERROR (plus ``self.returncode``)
        for any other exit code.

    Raises:
        Exception: Errors raised while supervising are logged, the process
            is terminated, ``after_run`` is called and the error re-raised.
    """
    self.before_run()
    env = os.environ.copy()
    env['PYTHONPATH'] = os.pathsep.join(
        ['.', self._dir, env.get('PYTHONPATH', '')] + sys.path)
    gpus = [i for (i, _) in resources['gpus']]
    env['CUDA_VISIBLE_DEVICES'] = ','.join(str(g) for g in gpus)
    root = os.path.dirname(os.path.abspath(myelindl.__file__))
    # NOTE(review): "-m" normally expects a dotted module name rather than
    # a filesystem path — confirm this launches the server as intended.
    args = [
        sys.executable,
        '-m',
        os.path.join(root, 'tools', 'unix_server'),
        '--checkpoint-path=%s' % self.checkpoint_path,
        '--job-dir=%s' % self._dir,
    ]
    logger.debug("run args: {}".format(args))
    self.p = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=self._dir,
        close_fds=True,
        env=env,
    )
    try:
        sigterm_time = None  # When was the SIGTERM signal sent
        sigterm_timeout = 120  # When should the SIGKILL signal be sent
        while self.p.poll() is None:
            for line in utils.nonblocking_readlines(self.p.stdout):
                if self.aborted.is_set():
                    if sigterm_time is None:
                        # Attempt graceful shutdown
                        self.p.send_signal(signal.SIGTERM)
                        sigterm_time = time.time()
                        self.status = ABORT
                    break
                if line is not None:
                    # Remove whitespace
                    line = line.strip()
                if line:
                    self.output_log.write('%s\n' % line)
                    self.output_log.flush()
                else:
                    time.sleep(0.05)
            # Checked once per poll so it is still reached after the abort
            # branch has broken out of the read loop.
            if sigterm_time is not None and \
                    (time.time() - sigterm_time > sigterm_timeout):
                self.p.send_signal(signal.SIGKILL)
                logger.debug('Sent SIGKILL to task "%s"' % self.name)
            time.sleep(0.01)
    except Exception as e:
        logger.warning('service exception: {}'.format(e))
        self.p.terminate()
        self.after_run()
        # Bare raise keeps the original traceback.
        raise
    self.after_run()
    if self.status != RUN:
        return False
    if self.p.returncode != 0:
        self.returncode = self.p.returncode
        self.status = ERROR
    else:
        self.status = DONE
    return True
def create(username, checkpoint_id):
    """Create a ``Service`` for a checkpoint and return the service's id.

    The checkpoint id is resolved to a path with ``get_checkpoint_path``
    and that path is handed to ``Service`` together with *username*.
    """
    path = get_checkpoint_path(checkpoint_id)
    logger.debug("get checkpoint_path: {}".format(path))
    return Service(username, path).id
def delete_by_id(job_id):
    """Remove the run directory that belongs to *job_id*, if it exists.

    ``Instance.load`` is called first; its result is not used here.
    """
    # Result intentionally discarded; presumably this fails for an unknown
    # job id — confirm against Instance.load.
    Instance.load(job_id)
    logger.debug("delete instance {}".format(job_id))
    target = os.path.join(RUNS_DIR, job_id)
    if not os.path.exists(target):
        return
    shutil.rmtree(target)
def run(self, resources):
    """Run the job in a Docker container and stream its output to the log.

    Steps: write ``self.parameters`` as YAML into the job directory (if
    any), rewrite the JOB_DIR / DATASET_DIR placeholders in the user
    arguments, write them to ``user_args.sh``, ``docker create`` a
    container that runs that script, then ``docker start -i`` it and copy
    its output into ``self.output_log`` until it exits.

    When ``self.aborted`` is set the container is stopped, the client
    process receives SIGTERM and the status becomes ABORT; SIGKILL follows
    if the process is still alive ``sigterm_timeout`` seconds later.

    Args:
        resources: Mapping whose 'gpus' entry is a list of
            ``(identifier, amount)`` pairs.

    Returns:
        ``False`` if the status is no longer RUN when the process ends;
        otherwise ``True``, with the status set to DONE for exit code 0
        and ERROR (plus ``self.returncode``) otherwise.

    Raises:
        subprocess.CalledProcessError: If ``docker create`` fails; its
            output is written to the log first.
        Exception: Errors while supervising are logged, the process is
            terminated, ``after_run`` is called and the error re-raised.
    """
    self.before_run()
    env = os.environ.copy()
    env['PYTHONPATH'] = os.pathsep.join(
        ['.', self._dir, env.get('PYTHONPATH', '')] + sys.path)
    gpus = [i for (i, _) in resources['gpus']]
    gpu_list = ','.join(str(g) for g in gpus)
    env['CUDA_VISIBLE_DEVICES'] = gpu_list
    env['NV_GPU'] = gpu_list
    container_id = None

    if self.parameters:
        with open(os.path.join(self._dir, self.PARAMS), 'w') as f:
            f.write(yaml.dump(dict(self.parameters)))

    user_args = copy.copy(self.user_args)
    # The uid itself is currently unused, but the lookup is kept because
    # it raises KeyError for an unknown account.
    pwd.getpwnam(self.username)

    # prepare docker job parameters
    job_real_dir = get_real_path(self._dir)
    args = ['/usr/bin/docker', 'create', '--runtime=nvidia', '--rm']
    args.extend(['-e', 'NVIDIA_VISIBLE_DEVICES=' + env['NV_GPU']])
    args.extend(['-e', 'JOB_DIR=/workspace'])
    metrics_path = os.path.join("/workspace", self.METRICS)
    args.extend(['-e', 'METRICS_PATH={}'.format(metrics_path)])
    if self.dataset_path:
        dataset_host_path = get_real_path(self.dataset_path)
        args.extend(['-v', dataset_host_path + ':/dataset'])

    # check for user arguments
    for n, arg in enumerate(user_args):
        # Chain the substitutions on the same string so an argument that
        # holds both placeholders gets both replaced.
        arg = arg.replace("JOB_DIR", "/workspace")
        if self.dataset_path:
            arg = arg.replace("DATASET_DIR", "/dataset")
        user_args[n] = arg
    with open(os.path.join(self._dir, "user_args.sh"), 'w') as f:
        f.write(' '.join(user_args))

    args.extend(['-v', job_real_dir + ':/workspace',
                 '-w', self._workspace,
                 self.image_tag, "bash", "-x", "/workspace/user_args.sh"])
    logger.info("Run {}".format(' '.join(args)))
    try:
        # Pass the argv list directly (no shell) so paths or tags with
        # spaces or shell metacharacters cannot alter the command.
        output = subprocess.check_output(args,
                                         stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as exc:
        for line in exc.output.split('\n'):
            self.output_log.write('%s\n' % line)
        self.after_run()
        raise
    logger.info("Run output: {}".format(output))
    # docker accepts a unique id prefix; keep the first 6 characters.
    container_id = re.findall(r"^\w+", output)[0][:6]

    args = ["/usr/bin/docker", "start", "-i", container_id]
    # End of docker start
    logger.info("Run {}".format(' '.join(args)))
    self.p = subprocess.Popen(args,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              cwd=self._dir,
                              close_fds=True,
                              env=env,
                              )
    try:
        sigterm_time = None  # When was the SIGTERM signal sent
        sigterm_timeout = 120  # When should the SIGKILL signal be sent
        while self.p.poll() is None:
            for line in utils.nonblocking_readlines_p(self.p):
                if self.aborted.is_set():
                    if sigterm_time is None:
                        if container_id:
                            subprocess.check_output(
                                "/usr/bin/docker stop {}".format(
                                    container_id),
                                shell=True)
                        # Attempt graceful shutdown
                        self.p.send_signal(signal.SIGTERM)
                        sigterm_time = time.time()
                        self.status = ABORT
                    break
                try:
                    subprocess.check_output(
                        "/usr/bin/docker ps |grep {}".format(container_id),
                        shell=True)
                except Exception:
                    # Bug, docker start may hang while container not exist.
                    break
                if line is not None:
                    # Remove whitespace
                    line = line.strip()
                if line:
                    self.output_log.write('%s\n' % line.encode("utf-8"))
                    self.output_log.flush()
                else:
                    time.sleep(0.05)
            # Checked once per poll so it is still reached after the abort
            # branch has broken out of the read loop.
            if sigterm_time is not None and \
                    (time.time() - sigterm_time > sigterm_timeout):
                self.p.send_signal(signal.SIGKILL)
                logger.debug("Sent SIGKILL to task {}".format(self.name))
            time.sleep(0.01)
    except Exception as e:
        logger.debug("exception, {}, {}".format(e, traceback.format_exc()))
        self.p.terminate()
        self.after_run()
        # Bare raise keeps the original traceback.
        raise
    self.after_run()
    if self.status != RUN:
        return False
    if self.p.returncode != 0:
        self.returncode = self.p.returncode
        self.status = ERROR
    else:
        self.status = DONE
    return True
def run(self, resources):
    """Run the worker container and stop it once it has been idle too long.

    A container named after ``self.id`` is created with the granted ports
    published, the optional dataset mounted at /dataset and the user's
    home directory mounted at /workspace, then started with
    ``docker start -i``. While it runs, its output is copied to
    ``self.output_log`` and ``netstat`` is run inside the container; the
    time since an ESTABLISHED connection was last seen is tracked in
    ``self.idle_sec`` and the task aborts itself past ``IDLE_TIMEOUT``.

    Args:
        resources: Mapping whose 'gpus' and 'ports' entries are lists of
            ``(identifier, amount)`` pairs.

    Returns:
        ``False`` if the status is no longer RUN when the process ends;
        otherwise ``True``, with the status set to DONE for exit code 0
        and ERROR (plus ``self.returncode``) otherwise.

    Raises:
        subprocess.CalledProcessError: If ``docker create`` fails; its
            output is written to the log first.
        Exception: Errors while supervising are logged, the process is
            terminated, ``after_run`` is called and the error re-raised.
    """
    logger.info("Run worker!")
    self.before_run()
    env = os.environ.copy()
    gpus = [i for (i, _) in resources['gpus']]
    ports = [i for (i, _) in resources['ports']]
    env['NV_GPU'] = ','.join(str(g) for g in gpus)
    args = [
        '/usr/bin/docker', 'create', '--runtime=nvidia', '--rm', '--name',
        self.id
    ]
    args.extend(['-e', 'NVIDIA_VISIBLE_DEVICES=' + env['NV_GPU']])
    # Publish each granted host port on the container port at the same
    # index of self.port_list.
    for i, port in enumerate(ports):
        args.extend(['-p', "{}:{}".format(port, self.port_list[i])])
    if self.dataset_path:
        dataset_host_path = get_real_path(self.dataset_path)
        args.extend(['-v', dataset_host_path + ':/dataset'])
    if self.user_args:
        for n, arg in enumerate(self.user_args):
            if "JOB_ID" in arg:
                self.user_args[n] = arg.replace("JOB_ID", self.id)
    home_dir = os.path.join('/home/', self.username)
    home_real_path = get_real_path(home_dir)
    args.extend(['-v', home_real_path + ':/workspace', '-w', '/workspace'])
    # user_args may be empty or None; the container then runs its default
    # command instead of failing on list concatenation.
    args.extend([self.container] + list(self.user_args or []))
    logger.info("Run {}".format(' '.join(args)))
    try:
        output = subprocess.check_output(args)
    except subprocess.CalledProcessError as exc:
        for line in exc.output.split('\n'):
            self.output_log.write('%s\n' % line)
        self.after_run()
        logger.debug("docker create: {}".format(exc))
        raise
    logger.info("Run output: {}".format(output))
    # docker accepts a unique id prefix; keep the first 6 characters.
    container_id = re.findall(r"^\w+", output)[0][:6]
    args = ["/usr/bin/docker", "start", "-i", container_id]
    self.p = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        cwd=self._dir,
        close_fds=True,
        env=env,
    )
    # Time of the last observed ESTABLISHED connection (or of start-up).
    start_time = time.time()
    self.idle_sec = 0
    try:
        sigterm_time = None  # When was the SIGTERM signal sent
        sigterm_timeout = 120  # When should the SIGKILL signal be sent
        while self.p.poll() is None:
            for line in utils.nonblocking_readlines_p(self.p):
                if self.aborted.is_set():
                    if sigterm_time is None:
                        if container_id:
                            subprocess.check_output(
                                "/usr/bin/docker stop {}".format(
                                    container_id),
                                shell=True)
                        # Attempt graceful shutdown
                        self.p.send_signal(signal.SIGTERM)
                        sigterm_time = time.time()
                        self.status = ABORT
                    break
                try:
                    # check for connection & timeout
                    netst = subprocess.check_output(
                        "/usr/bin/docker exec {} netstat -nat 8888".format(
                            container_id),
                        shell=True)
                    if 'ESTABLISHED' in netst:
                        start_time = time.time()
                    idle_sec = time.time() - start_time
                    if idle_sec > 10:
                        self.idle_sec = idle_sec
                    if idle_sec > IDLE_TIMEOUT:
                        self.abort()
                        logger.info(
                            "jupyter {} timeout, terminating..".format(
                                self.id))
                except Exception:
                    # Best effort: a failed probe (e.g. container not up
                    # yet) must not stop supervision. KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    pass
                if line is not None:
                    # Remove whitespace
                    line = line.strip()
                if line:
                    self.output_log.write('%s\n' % line.encode("utf-8"))
                    self.output_log.flush()
                else:
                    time.sleep(0.05)
            # Checked once per poll so it is still reached after the abort
            # branch has broken out of the read loop.
            if sigterm_time is not None and \
                    (time.time() - sigterm_time > sigterm_timeout):
                self.p.send_signal(signal.SIGKILL)
                logger.debug("Sent SIGKILL to task {}".format(self.name))
            time.sleep(0.01)
    except Exception as e:
        logger.debug("exception, {}, {}".format(e, traceback.format_exc()))
        self.p.terminate()
        self.after_run()
        # Bare raise keeps the original traceback.
        raise
    self.after_run()
    if self.status != RUN:
        return False
    if self.p.returncode != 0:
        self.returncode = self.p.returncode
        self.status = ERROR
    else:
        self.status = DONE
    return True