Пример #1
0
    def __init__(self,
                 username,
                 dataset_id,
                 input_dir,
                 reader_type,
                 parameters=None):
        """Set up a conversion task for one dataset and one reader type.

        Args:
            username: Owner of the task; forwarded to the base initializer
                and embedded in the generated task id.
            dataset_id: Identifier of the dataset to convert.
            input_dir: Directory to read from. When falsy, the path returned
                by ``get_dataset_path(dataset_id)`` is used instead.
            reader_type: Target reader type; the task is flagged ``skip``
                when it is listed in ``SKIP_CONVERT``.
            parameters: Optional parameters for the conversion. Defaults to
                a fresh empty dict per instance (a shared ``{}`` default
                would leak mutations between tasks).
        """
        logger.debug('Convert inint {} '.format(reader_type))
        # The id doubles as the task name; the timestamp keeps ids distinct.
        task_id = 'conv-{}-{}-{}'.format(username, reader_type, time.time())
        super(ConversionTask, self).__init__(task_id,
                                             username,
                                             task_id,
                                             persistent=False)
        self.dataset_id = dataset_id
        self.input_dir = input_dir if input_dir else get_dataset_path(
            self.dataset_id)
        self.reader_type = reader_type
        self.output_dir = self._get_output_dir()
        self.parameters = {} if parameters is None else parameters
        self.metadata = {}
        self.error_message = []
        self.skip = reader_type in SKIP_CONVERT
        logger.debug('Convert init finish {} '.format(self.reader_type))
        self.status = INIT
Пример #2
0
def pkg_clone(repo_path, parent_dir, child_dir):
    """Copy a package directory from ``parent_dir`` into ``child_dir``.

    The sub-directory name on both sides is whatever
    ``_repo_name(repo_path)`` returns; the copy itself is delegated to
    ``system_copy``.
    """
    pkg_name = _repo_name(repo_path)
    source, target = (os.path.join(base, pkg_name)
                      for base in (parent_dir, child_dir))
    logger.debug("Copy pkg {} to {}".format(source, target))
    system_copy(source, target)
Пример #3
0
def inference(id, image):
    """Send ``image`` to a running service over its unix socket and return the reply.

    Args:
        id: Identifier passed to ``scheduler.get_instance`` to find the
            service.
        image: Payload sent as-is through the zmq REQ socket.

    Returns:
        The JSON-decoded reply, or an empty list when connecting, sending or
        receiving fails (the failure is logged as a warning).
    """
    svc = scheduler.get_instance(id)
    context = None
    socket = None
    # Connect to server by zmq
    try:
        context = zmq.Context()
        socket = context.socket(zmq.REQ)
        address = "ipc://{}/unix.socket".format(svc._dir)
        logger.debug("Connecting to {}".format(address))
        socket.connect(address)
        socket.send(image)
        # waiting for reply objects
        message = socket.recv()
    except Exception as e:
        logger.warning('Inference fail: {}'.format(e))
        return []
    finally:
        # A fresh context and socket are created per call, so release them
        # on every path. linger=0 drops unsent data so term() cannot block.
        if socket is not None:
            socket.close(linger=0)
        if context is not None:
            context.term()
    logger.debug("Reply: {}".format(message))
    return json.loads(message)
Пример #4
0
 def save(self):
     """Write ``self._dict`` as JSON to the ``SAVE_INFO`` file.

     The data is first written to a ``.tmp`` sibling and then moved into
     place, so an interrupted write does not leave a truncated file under
     the final name.

     Returns:
         True when the file was written, False when the run directory does
         not exist or when saving failed. Errors are logged, not raised.
     """
     try:
         if not os.path.exists(self._dir):
             return False
         tmpfile_path = self.path(self.SAVE_INFO + '.tmp')
         # Text mode: json.dumps returns str, which a 'wb' handle rejects
         # on Python 3.
         with open(tmpfile_path, 'w') as tmpfile:
             tmpfile.write(json.dumps(self._dict, sort_keys=True, indent=4))
         shutil.move(tmpfile_path, self.path(self.SAVE_INFO))
         return True
     except KeyboardInterrupt:
         # Deliberately suppressed: an interrupt during save is reported as
         # a failed save rather than propagated.
         pass
     except Exception as e:
         logger.warning('Caught %s while saving run %s: %s' %
                        (type(e).__name__, self.id, e))
         # format_exc() takes a frame limit, not the exception object.
         logger.debug(traceback.format_exc())
     return False
Пример #5
0
def get_value_decorator(config, conf_sess_name):
    """Build a decorator that runs a function only if a config section is set.

    Args:
        config: Mapping-like configuration object; may be falsy.
        conf_sess_name: Key of the section that must be present and truthy.

    Returns:
        A decorator. The wrapped function is called (and its result
        returned) only when ``config`` and ``config[conf_sess_name]`` are
        both truthy at call time; otherwise the wrapper returns None. A
        missing section is logged as a warning.
    """
    logger.debug("config = {}, \nconf_sess_name = {}".format(
        config, conf_sess_name))

    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            # Guard only the config lookup: a KeyError raised inside ``f``
            # is a real error and must not be reported as a missing section.
            try:
                enabled = config and config[conf_sess_name]
            except KeyError:
                logger.warning('No such config session in myeconfig')
                return None
            if enabled:
                return f(*args, **kwargs)
            return None

        return wrapper

    return decorator
Пример #6
0
def new(username, name, image_tag, dataset_path, user_args,
        num_gpu, project, repo_path, parameters, parent):
    """Create an ``Instance`` with a freshly generated, unused id.

    A leading ``bk/`` on the dataset path or on any whitespace-separated
    user argument is rewritten to ``/data/dataset/``.

    Args:
        username, name, image_tag, num_gpu, project, repo_path, parameters,
        parent: Forwarded unchanged to ``Instance``.
        dataset_path: Optional dataset location; must exist after the
            ``bk/`` rewrite when given.
        user_args: Command line string; split on whitespace before being
            handed to ``Instance``.

    Returns:
        The created ``Instance``.

    Raises:
        ValueError: If ``dataset_path`` is given but does not exist.
        Exception: Anything raised while building the instance is logged
            and re-raised.
    """
    # Draw ids until one does not collide with an existing run directory.
    while True:
        job_id = RUNS_PREFIX + JOBS_PREFIX + str(uuid.uuid4()).replace('-', '')[:8]
        job_dir = os.path.join(RUNS_DIR, job_id)
        if not os.path.exists(job_dir):
            break
    try:
        args = user_args.split()
        for n, arg in enumerate(args):
            if arg.startswith("bk/"):
                # count=1: only the prefix is an alias, a later "bk/" inside
                # the path must be left alone.
                args[n] = arg.replace('bk/', '/data/dataset/', 1)
        if dataset_path:
            if dataset_path.startswith("bk/"):
                dataset_path = dataset_path.replace('bk/', '/data/dataset/', 1)
            if not os.path.exists(dataset_path):
                raise ValueError("Cannot find dataset {}".format(dataset_path))
        inst = Instance(id=job_id,
                        username=username,
                        name=name,
                        image_tag=image_tag,
                        dataset_path=dataset_path,
                        user_args=args,
                        num_gpu=num_gpu,
                        project=project,
                        status_history=[],
                        repo_path=repo_path,
                        parameters=parameters,
                        parent=parent,
                        child=[])
        logger.debug("Create instance {}".format(inst.id))
    except Exception as e:
        logger.warning('Caught %s while creating instance %s: %s' % (type(e).__name__, job_id, e))
        # format_exc() takes a frame limit, not the exception object.
        logger.debug(traceback.format_exc())
        # Bare raise keeps the original traceback.
        raise
    return inst
Пример #7
0
    def run(self, resources):
        """Run the conversion subprocess and track its outcome in ``self.status``.

        When ``self.skip`` is set no process is started: the dataset is
        tagged as done and the method returns immediately.

        Otherwise the command from ``_task_arguments()`` is started with
        stderr merged into stdout. Every output line is written to
        ``self.output_log`` and handed to ``process_output``; lines it
        rejects are collected for debugging. The process is sent SIGKILL
        while ``self.aborted`` is set.

        Args:
            resources: Not used by this method.

        Raises:
            Exception: Any error raised while polling is logged, the process
                is terminated, the status set to ERROR, and the error
                re-raised. ``after_run`` is called in every case.
        """
        if self.skip:
            self.metadata.update({'status': 'Done'})
            self.metadata.update({'data': 0})
            add_tag(self.dataset_id, {
                self.reader_type: self.metadata,
            })
            self.status = DONE
            return

        args = self._task_arguments()
        self.p = subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            close_fds=True,
        )

        unknown_output = []
        self.before_run()
        try:
            while self.p.poll() is None:
                for line in utils.nonblocking_readlines(self.p.stdout):
                    if self.aborted.is_set():
                        self.p.send_signal(signal.SIGKILL)
                    if line is not None:
                        self.output_log.write(line)
                        line = line.strip()
                    if line:
                        if not self.process_output(line):
                            unknown_output.append(line)
                    else:
                        # Nothing to process: back off instead of delaying
                        # every recognised line.
                        time.sleep(0.05)
                    time.sleep(0.01)  # do not remove this line.
        except Exception as e:
            self.p.terminate()
            self.status = ERROR
            logger.warning('Convert dataset {} to {} fail. {}'.format(
                self.dataset_id, self.reader_type, str(e)))
            # format_exc() takes a frame limit, not the exception object.
            logger.debug(traceback.format_exc())
            logger.debug(unknown_output)
            raise
        finally:
            self.after_run()

        if self.p.returncode != 0:
            self.status = ERROR
            logger.warning(
                'Convert dataset {} to {} fail. return code {}, {}'.format(
                    self.dataset_id, self.reader_type, self.p.returncode,
                    self.error_message))
            logger.debug(unknown_output)
        else:
            self.status = DONE
Пример #8
0
    def offer_resources(self, resources):
        """Pick the ports, GPUs and host this task needs from ``resources``.

        Args:
            resources: Mapping with ``'ports'``, ``'gpus'`` and ``'hosts'``
                lists. Entries expose ``identifier``; port and GPU entries
                also expose ``remaining()``.

        Returns:
            ``{'gpus': [...], 'ports': [...], 'hosts': [...]}`` with
            ``(identifier, 1)`` tuples when one port per ``self.port_list``
            entry and ``self.num_gpu`` GPUs could be found; otherwise None.
            On success the selection is also stored in ``self.resources``
            and ``self.save()`` is called.
        """
        # A missing (None) port list or GPU count means "none required";
        # normalise once so the final comparison cannot fail on None.
        wanted_ports = len(self.port_list) if self.port_list else 0
        wanted_gpus = self.num_gpu or 0

        port_identifiers = []
        if wanted_ports:
            for resource in resources['ports']:
                if resource.remaining() >= 1:
                    # check for port availability
                    cmd = 'docker run --rm -p {}:8888 {} bash'.format(
                        resource.identifier, DEFAULT_IMG)
                    ret = os.system(cmd)
                    if ret == 0:
                        port_identifiers.append(resource.identifier)
                if len(port_identifiers) == wanted_ports:
                    break
        logger.debug("port_identifiers: {}".format(port_identifiers))

        gpu_identifiers = []
        if wanted_gpus:
            for resource in resources['gpus']:
                if resource.remaining() >= 1:
                    gpu_identifiers.append(resource.identifier)
                if len(gpu_identifiers) == wanted_gpus:
                    break
        logger.debug("gpu_identifiers: {}".format(gpu_identifiers))

        if len(port_identifiers) != wanted_ports or \
           len(gpu_identifiers) != wanted_gpus:
            return None

        logger.debug("return resources.. ")
        resources = {
            'gpus': [(i, 1) for i in gpu_identifiers],
            'ports': [(i, 1) for i in port_identifiers],
            'hosts': [(resources['hosts'][0].identifier, 1)]
        }
        self.resources = resources
        self.save()
        return resources
Пример #9
0
def add(username, id, vol_path='', create_time=None):
    """Register a new dataset (bucket) in the dataset metadata.

    The whole operation runs under ``metadata.lock``. The id must be 3 to
    63 characters long and consist only of ASCII letters and digits, and
    must not already be registered.

    Args:
        username: Owner recorded in the dataset entry.
        id: Dataset / bucket name.
        vol_path: Optional existing local path; when given it is symlinked
            to the dataset location under ``DATA_ROOT``.
        create_time: Optional creation timestamp; defaults to the current
            local time formatted as ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError: If the id is empty, too short, too long, or contains
            invalid characters.
        Exception: If the id already exists or ``vol_path`` is given but
            does not exist.
    """
    def reject(reason):
        # Every validation failure is logged at debug level, then raised.
        logger.debug(reason)
        raise ValueError(reason)

    # update metadata
    with metadata.lock:
        data = metadata.read_dataset_metadata()

        if not id:
            reject('bucket without name.')
        if len(id) < 3:
            reject('Bucket name cannot be smaller than 3 characters')
        if len(id) > 63:
            reject('Bucket name cannot be greater than 63 characters')
        # The pattern is anchored at the start only, so the matched prefix
        # has to be compared with the full id.
        leading = re.match(r'[a-zA-Z0-9]+', id)
        if leading is None or leading.group() != id:
            reject('Bucket name contains invalid characters')

        if id in data['datasets']:
            logger.debug('bucket {} already exist'.format(id))
            raise Exception('Dataset {} already exists'.format(id))

        c_time = create_time or datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        dst_path = os.path.join(DATA_ROOT, "dataset", id)
        if vol_path and not os.path.exists(vol_path):
            raise Exception('local path {} not exists'.format(vol_path))
        if os.path.exists(vol_path):
            logger.debug("create link {} -> {}".format(vol_path, dst_path))
            os.symlink(vol_path, dst_path)
        if not os.path.exists(dst_path):
            # No link was created: create the bucket with the mc CLI.
            check_output("mc mb mlsteam/{}".format(id), shell=True)

        data['datasets'][id] = {
            'id': id,
            'name': id,
            'description': '',
            'username': username,
            'from': 'CLI',
            'source': 'local',
            'type': 'file',
            'size': 0,
            'data_dir': get_dataset_path(id),
            'create_time': c_time,
            'vol_path': dst_path if vol_path else '',
        }
        metadata.save_dataset_metadata(data)
Пример #10
0
    def run(self, resources):
        """Start the ``unix_server`` tool as a subprocess and supervise it.

        Output lines are stripped and appended to ``self.output_log``. When
        ``self.aborted`` is set the process gets SIGTERM once, then SIGKILL
        after ``sigterm_timeout`` seconds.

        Args:
            resources: Mapping whose ``'gpus'`` entry is a list of
                ``(identifier, amount)`` pairs; the identifiers are exported
                as ``CUDA_VISIBLE_DEVICES``.

        Returns:
            False when ``self.status`` is no longer ``RUN`` once the process
            has ended (for example after an abort), True otherwise; in the
            latter case the status is set to ``ERROR`` or ``DONE`` from the
            return code.

        Raises:
            Exception: Errors raised while polling are logged and re-raised
                after terminating the process and calling ``after_run``.
        """
        self.before_run()
        env = os.environ.copy()
        # Make the job directory and the current interpreter's sys.path
        # importable in the child process.
        env['PYTHONPATH'] = os.pathsep.join(
            ['.', self._dir, env.get('PYTHONPATH', '')] + sys.path)
        gpus = [i for (i, _) in resources['gpus']]
        env['CUDA_VISIBLE_DEVICES'] = ','.join(str(g) for g in gpus)
        root = os.path.dirname(os.path.abspath(myelindl.__file__))
        # NOTE(review): ``-m`` is given a filesystem path rather than a
        # dotted module name - confirm this resolves on the target Python.
        args = [
            sys.executable,
            '-m',
            os.path.join(root, 'tools', 'unix_server'),
            '--checkpoint-path=%s' % self.checkpoint_path,
            '--job-dir=%s' % self._dir,
        ]
        logger.debug("run args: {}".format(args))
        # stderr is merged into stdout so one reader sees all output.
        self.p = subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=self._dir,
            close_fds=True,
            env=env,
        )
        try:
            sigterm_time = None  # When was the SIGTERM signal sent
            sigterm_timeout = 120  # When should the SIGKILL signal be sent
            while self.p.poll() is None:
                for line in utils.nonblocking_readlines(self.p.stdout):
                    # for line in self.p.stdout:
                    if self.aborted.is_set():
                        if sigterm_time is None:
                            # Attempt graceful shutdown
                            self.p.send_signal(signal.SIGTERM)
                            sigterm_time = time.time()
                            self.status = ABORT
                        # Stop reading; the outer loop keeps polling the
                        # process until it exits.
                        break
                    if line is not None:
                        # Remove whitespace
                        line = line.strip()

                    if line:
                        self.output_log.write('%s\n' % line)
                        self.output_log.flush()
                    else:
                        # No output for this read: back off briefly.
                        time.sleep(0.05)
                # Escalate to SIGKILL once the grace period has elapsed.
                if sigterm_time is not None and (time.time() - sigterm_time >
                                                 sigterm_timeout):
                    self.p.send_signal(signal.SIGKILL)
                    logger.debug('Sent SIGKILL to task "%s"' % self.name)
                time.sleep(0.01)
        except Exception as e:
            logger.warning('service exception: {}'.format(e))
            self.p.terminate()
            self.after_run()
            raise e

        self.after_run()
        # A status other than RUN means it was changed during the run
        # (this method sets ABORT on abort); report that as False.
        if self.status != RUN:
            return False
        if self.p.returncode != 0:
            self.returncode = self.p.returncode
            self.status = ERROR
        else:
            self.status = DONE
        return True
Пример #11
0
def create(username, checkpoint_id):
    """Create a ``Service`` for the given checkpoint and return its id.

    The checkpoint path is resolved with ``get_checkpoint_path`` and logged
    before the service is constructed.
    """
    ckpt_path = get_checkpoint_path(checkpoint_id)
    logger.debug("get checkpoint_path: {}".format(ckpt_path))
    return Service(username, ckpt_path).id
Пример #12
0
def delete_by_id(job_id):
    """Remove the run directory of ``job_id`` if it exists.

    ``Instance.load`` is called first; its result is not used here, but any
    error it raises propagates before anything is deleted.
    """
    _loaded = Instance.load(job_id)
    logger.debug("delete instance {}".format(job_id))
    run_dir = os.path.join(RUNS_DIR, job_id)
    if not os.path.exists(run_dir):
        return
    shutil.rmtree(run_dir)
Пример #13
0
    def run(self, resources):
        """Run the job in a docker container and stream its output to the log.

        Steps:
          1. Write ``self.parameters`` (if any) as YAML and the user command
             line as ``user_args.sh`` into the job directory.
          2. ``docker create`` a container with the job directory mounted at
             ``/workspace`` and, when set, the dataset at ``/dataset``.
          3. ``docker start -i`` the container and copy its output into
             ``self.output_log`` until it exits or the task is aborted.

        ``JOB_DIR`` and (when a dataset is set) ``DATASET_DIR`` in the user
        arguments are replaced by the in-container paths.

        Args:
            resources: Mapping whose ``'gpus'`` entry is a list of
                ``(identifier, amount)`` pairs.

        Returns:
            False when ``self.status`` is no longer ``RUN`` once the process
            has ended (for example after an abort), True otherwise; in the
            latter case the status is set to ``ERROR`` or ``DONE`` from the
            return code.

        Raises:
            subprocess.CalledProcessError: If ``docker create`` fails (its
                output is written to the log first).
            Exception: Errors raised while polling are logged and re-raised
                after terminating the process and calling ``after_run``.
        """
        self.before_run()
        env = os.environ.copy()
        env['PYTHONPATH'] = os.pathsep.join(['.', self._dir, env.get('PYTHONPATH', '')] + sys.path)
        gpus = [i for (i, _) in resources['gpus']]
        env['CUDA_VISIBLE_DEVICES'] = ','.join(str(g) for g in gpus)
        env['NV_GPU'] = ','.join(str(g) for g in gpus)
        container_id = None
        if self.parameters:
            with open(os.path.join(self._dir, self.PARAMS), 'w') as f:
                f.write(yaml.dump(dict(self.parameters)))
        # Work on a copy so the stored user arguments keep their placeholders.
        user_args = copy.copy(self.user_args)
        # The uid is only needed by the disabled ``-u`` option below; the
        # lookup is kept because it raises KeyError for an unknown user.
        user_uid = pwd.getpwnam(self.username).pw_uid
        # prepare docker job parameters
        job_real_dir = get_real_path(self._dir)
        args = ['/usr/bin/docker', 'create', '--runtime=nvidia', '--rm']
        args.extend(['-e', 'NVIDIA_VISIBLE_DEVICES='+env['NV_GPU']])
        args.extend(['-e', 'JOB_DIR=/workspace'])
        metrics_path = os.path.join("/workspace", self.METRICS)
        args.extend(['-e', 'METRICS_PATH={}'.format(metrics_path)])
        #args.extend(['-u', '{}:{}'.format(user_uid, user_uid)])
        if self.dataset_path:
            dataset_host_path = get_real_path(self.dataset_path)
            args.extend(['-v', dataset_host_path+':/dataset'])
        # check for user arguments
        for n, arg in enumerate(user_args):
            # Apply both substitutions to the same value so an argument
            # holding both placeholders gets both replaced.
            if "JOB_DIR" in arg:
                arg = arg.replace("JOB_DIR", "/workspace")
            if self.dataset_path and "DATASET_DIR" in arg:
                arg = arg.replace("DATASET_DIR", "/dataset")
            user_args[n] = arg
        with open(os.path.join(self._dir, "user_args.sh"), 'w') as f:
            f.write(' '.join(user_args))
        args.extend(['-v', job_real_dir+':/workspace','-w', self._workspace, self.image_tag, "bash", "-x", "/workspace/user_args.sh"])
        # NOTE(review): the command is joined and run through a shell, so
        # image_tag and the mounted paths are interpreted by the shell.
        # Passing the list without shell=True would be safer - confirm no
        # caller relies on shell expansion before changing it.
        run_args = ' '.join(args)
        logger.info("Run {}".format(run_args))
        try:
            output = subprocess.check_output(run_args, stderr=subprocess.STDOUT,
                                             shell=True, universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            for line in exc.output.split('\n'):
                self.output_log.write('%s\n' % line)
            self.after_run()
            raise
        logger.info("Run output: {}".format(output))
        # NOTE(review): only the first 6 characters of the id are kept;
        # docker accepts unambiguous prefixes, but a short prefix can collide.
        container_id = re.findall(r"^\w+", output)[0][:6]
        args = ["/usr/bin/docker", "start", "-i", container_id]
        run_args = ' '.join(args)
        # End of docker start
        logger.info("Run {}".format(run_args))
        self.p = subprocess.Popen(run_args,
                                  shell=True,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  cwd=self._dir,
                                  close_fds=True,
                                  env=env,
                                  )
        try:
            sigterm_time = None  # When was the SIGTERM signal sent
            sigterm_timeout = 120  # When should the SIGKILL signal be sent
            while self.p.poll() is None:
                for line in utils.nonblocking_readlines_p(self.p):
                    if self.aborted.is_set():
                        if sigterm_time is None:
                            if container_id:
                                subprocess.check_output("/usr/bin/docker stop {}".format(container_id), shell=True)
                            # Attempt graceful shutdown
                            self.p.send_signal(signal.SIGTERM)
                            sigterm_time = time.time()
                            self.status = ABORT
                        break
                    try:
                        subprocess.check_output("/usr/bin/docker ps |grep {}".format(container_id), shell=True)
                    except Exception:
                        # Bug, docker start may hang while container not exist.
                        # Stop reading; Exception (not a bare except) lets
                        # KeyboardInterrupt/SystemExit propagate.
                        break
                    if line is not None:
                        # Remove whitespace
                        line = line.strip().rstrip()

                    if line:
                        self.output_log.write('%s\n' % line.encode("utf-8"))
                        self.output_log.flush()
                    else:
                        time.sleep(0.05)
                # Escalate to SIGKILL once the grace period has elapsed.
                if sigterm_time is not None and (time.time() - sigterm_time > sigterm_timeout):
                    self.p.send_signal(signal.SIGKILL)
                    logger.debug("Sent SIGKILL to task {}".format(self.name))
                time.sleep(0.01)
        except Exception as e:
            logger.debug("exception, {}, {}".format(e, traceback.format_exc()))
            self.p.terminate()
            self.after_run()
            raise

        self.after_run()
        if self.status != RUN:
            return False
        if self.p.returncode != 0:
            self.returncode = self.p.returncode
            self.status = ERROR
        else:
            self.status = DONE
        return True
Пример #14
0
    def run(self, resources):
        """Run the worker container and stop it after a period without connections.

        A container named after ``self.id`` is created from
        ``self.container`` with the user's home directory mounted at
        ``/workspace`` and the offered ports published, then started with
        ``docker start -i``. While it runs, its output is copied to
        ``self.output_log`` and ``netstat`` inside the container is used to
        detect established connections; when none are seen for more than
        ``IDLE_TIMEOUT`` seconds the task aborts itself.

        ``JOB_ID`` in ``self.user_args`` is replaced in place by ``self.id``.

        Args:
            resources: Mapping with ``'gpus'`` and ``'ports'`` lists of
                ``(identifier, amount)`` pairs.

        Returns:
            False when ``self.status`` is no longer ``RUN`` once the process
            has ended (for example after an abort), True otherwise; in the
            latter case the status is set to ``ERROR`` or ``DONE`` from the
            return code.

        Raises:
            subprocess.CalledProcessError: If ``docker create`` fails.
            Exception: Errors raised while polling are logged and re-raised
                after terminating the process and calling ``after_run``.
        """
        logger.info("Run worker!")
        self.before_run()
        env = os.environ.copy()
        gpus = [i for (i, _) in resources['gpus']]
        ports = [i for (i, _) in resources['ports']]
        env['NV_GPU'] = ','.join(str(g) for g in gpus)
        args = [
            '/usr/bin/docker', 'create', '--runtime=nvidia', '--rm', '--name',
            self.id
        ]
        args.extend(['-e', 'NVIDIA_VISIBLE_DEVICES=' + env['NV_GPU']])
        #user_uid = pwd.getpwnam(self.username).pw_uid
        #args.extend(['-u', '{}:{}'.format(user_uid, user_uid)])
        if ports:
            # Offered host port i is mapped to the i-th requested container port.
            for i, port in enumerate(ports):
                args.extend(['-p', "{}:{}".format(port, self.port_list[i])])
        if self.dataset_path:
            dataset_host_path = get_real_path(self.dataset_path)
            args.extend(['-v', dataset_host_path + ':/dataset'])
        # Treat missing user arguments as "none" so the command can still be
        # built; an existing list is updated in place as before.
        user_args = self.user_args or []
        for n, arg in enumerate(user_args):
            if "JOB_ID" in arg:
                user_args[n] = arg.replace("JOB_ID", self.id)
        home_dir = os.path.join('/home/', self.username)
        home_real_path = get_real_path(home_dir)
        args.extend(['-v', home_real_path + ':/workspace', '-w', '/workspace'])
        args.extend([self.container] + list(user_args))
        logger.info("Run {}".format(' '.join(args)))
        try:
            output = subprocess.check_output(args)
        except subprocess.CalledProcessError as exc:
            for line in exc.output.split('\n'):
                self.output_log.write('%s\n' % line)
            self.after_run()
            logger.debug("docker create: {}".format(exc))
            raise
        logger.info("Run output: {}".format(output))
        # NOTE(review): only the first 6 characters of the id are kept;
        # docker accepts unambiguous prefixes, but a short prefix can collide.
        container_id = re.findall(r"^\w+", output)[0][:6]
        args = ["/usr/bin/docker", "start", "-i", container_id]
        self.p = subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            cwd=self._dir,
            close_fds=True,
            env=env,
        )
        # Time of the last observed connection; idle time counts from here.
        start_time = time.time()
        self.idle_sec = 0
        try:
            sigterm_time = None  # When was the SIGTERM signal sent
            sigterm_timeout = 120  # When should the SIGKILL signal be sent
            while self.p.poll() is None:
                for line in utils.nonblocking_readlines_p(self.p):
                    if self.aborted.is_set():
                        if sigterm_time is None:
                            if container_id:
                                subprocess.check_output(
                                    "/usr/bin/docker stop {}".format(
                                        container_id),
                                    shell=True)
                            # Attempt graceful shutdown
                            self.p.send_signal(signal.SIGTERM)
                            sigterm_time = time.time()
                            self.status = ABORT
                        break
                    try:
                        # check for connection & timeout
                        netst = subprocess.check_output(
                            "/usr/bin/docker exec {} netstat -nat 8888".format(
                                container_id),
                            shell=True)
                        if 'ESTABLISHED' in netst:
                            start_time = time.time()
                        idle_sec = time.time() - start_time
                        if idle_sec > 10:
                            self.idle_sec = idle_sec
                        if idle_sec > IDLE_TIMEOUT:
                            self.abort()
                            logger.info(
                                "jupyter {} timeout, terminating..".format(
                                    self.id))
                    except Exception:
                        # Best effort: a failed probe must not stop the
                        # worker. Exception (not a bare except) lets
                        # KeyboardInterrupt/SystemExit propagate.
                        pass
                    if line is not None:
                        # Remove whitespace
                        line = line.strip().rstrip()

                    if line:
                        self.output_log.write('%s\n' % line.encode("utf-8"))
                        self.output_log.flush()
                    else:
                        time.sleep(0.05)
                # Escalate to SIGKILL once the grace period has elapsed.
                if sigterm_time is not None and (time.time() - sigterm_time >
                                                 sigterm_timeout):
                    self.p.send_signal(signal.SIGKILL)
                    logger.debug("Sent SIGKILL to task {}".format(self.name))
                time.sleep(0.01)
        except Exception as e:
            logger.debug("exception, {}, {}".format(e, traceback.format_exc()))
            self.p.terminate()
            self.after_run()
            raise

        self.after_run()
        if self.status != RUN:
            return False
        if self.p.returncode != 0:
            self.returncode = self.p.returncode
            self.status = ERROR
        else:
            self.status = DONE
        return True