def initialize(cls, isMaster):
    """Pick usable shuffle directories from WORKDIR and record the serve URI."""
    # keep only workdirs whose parent directory exists on this node
    candidates = [d for d in env.get('WORKDIR')
                  if os.path.exists(os.path.dirname(d))]
    cls.shuffleDir = candidates
    if not candidates:
        return
    # default to serving the first shuffle dir over file://
    cls.serverUri = env.get('SERVER_URI', 'file://' + candidates[0])
    logger.debug("shuffle dir: %s", cls.shuffleDir)
def initialize(cls, isMaster):
    """Collect existing work directories for shuffle and fetch the server URI."""
    valid = [d for d in env.get('WORKDIR')
             if os.path.exists(os.path.dirname(d))]
    cls.shuffleDir = valid
    if not valid:
        return
    cls.serverUri = env.get('SERVER_URI')
    logger.debug("shuffle dir: %s", cls.shuffleDir)
def __init__(self, isMaster):
    """Master: start a CacheTrackerServer; worker: connect a client over a disk cache."""
    LocalCacheTracker.__init__(self, isMaster)
    if isMaster:
        self.server = CacheTrackerServer(self.locs)
        self.server.start()
        env.register('CacheTrackerAddr', self.server.addr)
    else:
        workdir = env.get('WORKDIR')[0]
        self.cache = DiskCache(self, os.path.join(workdir, 'cache'))
        self.client = CacheTrackerClient(env.get('CacheTrackerAddr'))
def __init__(self, isMaster):
    """Set up the cache-tracker server (master) or a disk-cache-backed client (worker)."""
    LocalCacheTracker.__init__(self, isMaster)
    if not isMaster:
        base = env.get("WORKDIR")[0]
        self.cache = DiskCache(self, os.path.join(base, "cache"))
        self.client = CacheTrackerClient(env.get("CacheTrackerAddr"))
    else:
        self.server = CacheTrackerServer(self.locs)
        self.server.start()
        env.register("CacheTrackerAddr", self.server.addr)
def start(self):
    """Idempotently attach this client to the shared download-manager state."""
    if self._started:
        return
    self._started = True
    start_download_manager()
    # addresses are published by the guide/download managers via env
    self.guide_addr = env.get(GUIDE_ADDR)
    self.download_addr = env.get(DOWNLOAD_ADDR)
    self.cache = Cache()
    self.ctx = zmq.Context()
    # share the manager's cross-process dicts and condition variable
    self.shared_uuid_fn_dict = _download_manager.shared_uuid_fn_dict
    self.shared_uuid_map_dict = _download_manager.shared_uuid_map_dict
    self.download_cond = _download_manager.download_cond
def get(self, key):
    """Return the cached value for key, loading from a remote node when absent locally.

    key is a (rdd_id, index) pair.  Returns None when the value cannot be
    found locally or via any registered host.
    """
    p = self.get_path(key)
    if os.path.exists(p):
        return self.load(open(p, 'rb'))
    # load from other node
    if not env.get('SERVER_URI'):
        return
    rdd_id, index = key
    locs = self.tracker.getCacheUri(rdd_id, index)
    if not locs:
        return
    # only the most recently registered host is tried; no retry here
    serve_uri = locs[-1]
    uri = '%s/cache/%s' % (serve_uri, os.path.basename(p))
    try:
        f = urllib.request.urlopen(uri)
    except IOError:
        logger.warning('urlopen cache uri %s failed', uri)
        raise
    if f.code == 404:
        logger.warning('load from cache %s failed', uri)
        # drop the stale location so later lookups skip this host
        self.tracker.removeHost(rdd_id, index, serve_uri)
        f.close()
        return
    return self.load(f)
def __init__(self):
    """Wire up the disk cache and tracker client; the master also mirrors server locs."""
    workdir = env.get("WORKDIR")[0]
    self.cache = DiskCache(self, os.path.join(workdir, "cache"))
    self.client = env.trackerClient
    if env.isMaster:
        self.locs = env.trackerServer.locs
    self.rdds = {}
def _get_path(self):
    """Return (creating if needed) the directory used to persist mutable dicts.

    Prefers dirs[0]; when its filesystem is below 66% free blocks, the data
    directory is created on an alternate workdir and symlinked into place.
    Raises RuntimeError when no workdir is available.
    """
    dirs = env.get('WORKDIR')
    if not dirs:
        raise RuntimeError('No available workdir')
    path = os.path.join(dirs[0], 'mutable_dict')
    if os.path.exists(path):
        return path
    st = os.statvfs(dirs[0])
    ratio = st.f_bfree * 1.0 / st.f_blocks
    if ratio >= 0.66:
        mkdir_p(path)
        return path
    for d in dirs[1:]:
        p = os.path.join(d, 'mutable_dict')
        try:
            os.makedirs(p)
            os.symlink(p, path)
        except OSError as e:
            # best effort: directory or link may already exist
            pass
        # returns after the first alternate dir attempt, successful or not
        return path
    raise RuntimeError('Cannot find suitable workdir')
def run_task(task, aid):
    """Execute a task and return (mesos state, pickled result tuple).

    The result payload is (task.id, status, (flag, data), accUpdate) where
    flag encodes the serializer (0=marshal, 1=pickle) and +2 means `data`
    is a filesystem path instead of the serialized bytes.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            # result not marshalable; fall back to pickle
            flag, data = 1, cPickle.dumps(result)
        if len(data) > TASK_RESULT_LIMIT:
            # too big to inline: spill to the workdir and ship the path
            workdir = env.get('WORKDIR')
            # NOTE(review): elsewhere in this file WORKDIR is a list —
            # confirm this should not be env.get('WORKDIR')[0]
            path = os.path.join(workdir, str(task.id) + '.result')
            with open(path, 'wb') as f:  # serialized payload is bytes
                f.write(data)
            data = path
            flag += 2
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def __init__(self):
    """Attach the disk cache and tracker client; mirror locs when a tracker server exists."""
    self.cache = DiskCache(
        self, os.path.join(env.get('WORKDIR')[0], 'cache'))
    self.client = env.trackerClient
    if env.trackerServer is not None:
        self.locs = env.trackerServer.locs
    self.rdds = {}
def getOutputFile(cls, shuffle_id, input_id, output_id, datasize=0):
    """Return the path for a shuffle output file.

    datasize < 0: disk first
    datasize > 0: memfirst
    datasize = 0: read only, use link
    """
    shuffleDir = env.get('WORKDIR')
    path = os.path.join(shuffleDir[0], str(shuffle_id), str(input_id))
    mkdir_p(path)
    p = os.path.join(path, str(output_id))
    if datasize != 0 and len(shuffleDir) > 1:
        use_disk = datasize < 0
        if datasize > 0:
            # prefer the primary dir unless it is short on space
            st = os.statvfs(path)
            free = st.f_bfree * st.f_bsize
            ratio = st.f_bfree * 1.0 / st.f_blocks
            use_disk = free < max(datasize, 1 << 30) or ratio < 0.66
        if use_disk:
            # place the real file on a random alternate dir, symlink from primary
            d2 = os.path.join(
                random.choice(shuffleDir[1:]),
                str(shuffle_id), str(input_id))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(output_id))
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def __init__(self):
    """Create the disk-backed cache and connect to the cluster tracker."""
    base = env.get('WORKDIR')[0]
    cachedir = os.path.join(base, 'cache')
    self.cache = DiskCache(self, cachedir)
    self.client = env.trackerClient
    if env.isMaster:
        self.locs = env.trackerServer.locs
    self.rdds = {}
def initialize(cls, is_master):
    """Start the tree-broadcast tracker thread (master) or look up its address (worker)."""
    Broadcast.initialize(is_master)
    sock = env.ctx.socket(zmq.REP)
    sock.setsockopt(zmq.LINGER, 0)
    port = sock.bind_to_random_port("tcp://0.0.0.0")
    cls.tracker_addr = 'tcp://%s:%d' % (cls.host, port)

    def run():
        # serve (guide_addr, n_blocks) lookups; a falsy uuid request stops the loop
        logger.debug("TreeBroadcast tracker started at %s", cls.tracker_addr)
        while True:
            uuid = sock.recv_pyobj()
            obj = cls.guides.get(uuid, '')
            sock.send_pyobj(obj and (obj.guide_addr, len(obj.blocks)))
            if not uuid:
                break
        sock.close()
        logger.debug("TreeBroadcast tracker stopped")

    if is_master:
        cls.tracker_thread = spawn(run)
        env.register('TreeBroadcastTrackerAddr', cls.tracker_addr)
    else:
        # workers discard the locally bound address and use the master's
        cls.tracker_addr = env.get('TreeBroadcastTrackerAddr')
    logger.debug("TreeBroadcast initialized")
def run_task(task_data):
    """Deserialize, run a task, and return (mesos state, pickled result).

    flag in the result encodes the serializer (0=marshal, 1=pickle); +2
    means `data` is a URI the scheduler must fetch instead of inline bytes.
    """
    try:
        gc.disable()  # NOTE(review): never re-enabled here — confirm intended
        task, ntry = cPickle.loads(decompress(task_data))
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # spill oversized results; the scheduler fetches them over HTTP
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            with open(path, 'wb') as f:  # compressed payload is bytes
                f.write(data)
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def run_task(task, aid):
    """Run one task; spill oversized results to the workdir when a DFS is available.

    Returns (mesos state, pickled (task.id, status, (flag, data), accUpdate)).
    flag: 0=marshal, 1=pickle, +2 when `data` is a path, not bytes.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        result = task.run(aid)
        accUpdate = Accumulator.values()
        try:
            flag, data = 0, marshal.dumps(result)
        except ValueError:
            # result not marshalable; fall back to pickle
            flag, data = 1, cPickle.dumps(result)
        if len(data) > TASK_RESULT_LIMIT and env.dfs:
            workdir = env.get('WORKDIR')
            # NOTE(review): WORKDIR is a list elsewhere; confirm [0] is not needed
            path = os.path.join(workdir, str(task.id) + '.result')
            with open(path, 'wb') as f:  # serialized payload is bytes
                f.write(data)
            data = path
            flag += 2
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        setproctitle('dpark worker: idle')
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def _get_path(self):
    """Return (creating if needed) the mutable_dict directory.

    Prefers dirs[0]; when its filesystem is below 66% free blocks the data
    directory is created on an alternate workdir and symlinked into place.
    Raises RuntimeError when no workdir is available.
    """
    dirs = env.get('WORKDIR')
    if not dirs:
        raise RuntimeError('No available workdir')
    path = os.path.join(dirs[0], 'mutable_dict')
    if os.path.exists(path):
        return path
    st = os.statvfs(dirs[0])
    ratio = st.f_bfree * 1.0 / st.f_blocks
    if ratio >= 0.66:
        mkdir_p(path)
        return path
    for d in dirs[1:]:
        p = os.path.join(d, 'mutable_dict')
        try:
            os.makedirs(p)
            os.symlink(p, path)
        except OSError:
            # best effort: directory or link may already exist
            pass
        # returns after the first alternate dir attempt, successful or not
        return path
    raise RuntimeError('Cannot find suitable workdir')
def __init__(self, items):
    """Sort items by key and spill them to a gzip temp file in a workdir.

    Each record is a 4-byte native length prefix followed by the marshal-
    (preferred) or pickle-serialized item; self.loads records the matching
    deserializer for readback.
    """
    self.bufsize = 4096 * 1024
    self.buf = None
    self.offset = 0
    dirs = env.get('WORKDIR')
    # prefer alternate workdirs; fall back to the primary one
    self.path = path = os.path.join(
        random.choice(dirs[1:]) if dirs[1:] else dirs[0],
        'shuffle-%s.tmp.gz' % uuid.uuid4().hex)
    with atomic_file(path, bufsize=self.bufsize) as f:
        f = gzip.GzipFile(fileobj=f)
        items = sorted(items, key=lambda k_v: k_v[0])
        try:
            for i in items:
                s = marshal.dumps(i)
                f.write(struct.pack("I", len(s)))
                f.write(s)
            self.loads = marshal.loads
        except Exception:
            # some item was not marshalable: restart the stream with pickle
            f.rewind()
            for i in items:
                s = six.moves.cPickle.dumps(i)
                f.write(struct.pack("I", len(s)))
                f.write(s)
            self.loads = six.moves.cPickle.loads
        f.close()
def run_task(task, ntry):
    """Run one task attempt and return (mesos state, pickled result tuple).

    flag: 0=marshal, 1=pickle; +2 when `data` is a fetchable URI instead of
    inline compressed bytes.
    """
    try:
        setproctitle('dpark worker %s: run task %s' % (Script, task))
        Accumulator.clear()
        gc.disable()  # NOTE(review): not re-enabled here — confirm intended
        result = task.run(ntry)
        accUpdate = Accumulator.values()
        if marshalable(result):
            flag, data = 0, marshal.dumps(result)
        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)
        if len(data) > TASK_RESULT_LIMIT:
            # spill oversized result; scheduler fetches it via the file server
            workdir = env.get('WORKDIR')
            name = 'task_%s_%s.result' % (task.id, ntry)
            path = os.path.join(workdir, name)
            with open(path, 'wb') as f:  # compressed payload is bytes
                f.write(data)
            data = LocalFileShuffle.getServerUri() + '/' + name
            flag += 2
        return mesos_pb2.TASK_FINISHED, cPickle.dumps(
            (task.id, Success(), (flag, data), accUpdate), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        return mesos_pb2.TASK_FAILED, cPickle.dumps(
            (task.id, OtherFailure(msg), None, None), -1)
def getOutputFile(cls, shuffle_id, input_id, output_id, datasize=0):
    """Return the path for a shuffle output file.

    datasize < 0: disk first
    datasize > 0: memfirst
    datasize = 0: read only, use link
    """
    shuffleDir = env.get('WORKDIR')
    path = os.path.join(shuffleDir[0], str(shuffle_id), str(input_id))
    mkdir_p(path)
    p = os.path.join(path, str(output_id))
    if datasize != 0 and len(shuffleDir) > 1:
        use_disk = datasize < 0
        if datasize > 0:
            # prefer the primary dir unless it is short on space
            st = os.statvfs(path)
            free = st.f_bfree * st.f_bsize
            ratio = st.f_bfree * 1.0 / st.f_blocks
            use_disk = free < max(datasize, 1 << 30) or ratio < 0.66
        if use_disk:
            # place the real file on a random alternate dir, symlink from primary
            d2 = os.path.join(random.choice(shuffleDir[1:]),
                              str(shuffle_id), str(input_id))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(output_id))
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def start(self, is_master):
    """Publish the broadcast guide address on the master, or fetch it on workers."""
    if not is_master:
        self.guide_addr = env.get('BroadcastGuideAddr')
    else:
        self.guides = {}
        addr, thread = self.start_guide()
        self.guide_addr, self.guide_thread = addr, thread
        env.register('BroadcastGuideAddr', addr)
    logger.debug("broadcast started: %s", self.guide_addr)
def __init__(self, isMaster):
    """Master: serve map-output locations; worker: connect a client to the master."""
    LocalMapOutputTracker.__init__(self, isMaster)
    if not isMaster:
        self.client = MapOutputTrackerClient(env.get('MapOutputTrackerAddr'))
    else:
        self.server = MapOutputTrackerServer(self.serverUris)
        self.server.start()
        env.register('MapOutputTrackerAddr', self.server.addr)
    logger.debug("MapOutputTracker started")
def _get_path(self):
    """Ensure and return the mutable_dict directory under the first workdir.

    Raises Exception when no workdir is configured.
    """
    dirs = env.get('WORKDIR')
    if not dirs:
        raise Exception('No available workdir')
    path = os.path.join(dirs[0], 'mutable_dict')
    if not os.path.exists(path):
        try:
            os.mkdir(path)
        except OSError:
            # best effort: another process may have created it concurrently
            pass
    # sibling _get_path variants return the path; without this the caller
    # received None
    return path
def getOrCompute(self, rdd, split):
    """Return the cached partition, computing and caching it on a miss."""
    key = (rdd.id, split.index)
    hit = self.cache.get(key)
    if hit is not None:
        logger.debug("Found partition in cache! %s", key)
        return hit
    logger.debug("partition not in cache, %s", key)
    result = self.cache.put(key, rdd.compute(split), is_iterator=True)
    serve_uri = env.get('SERVER_URI')
    if serve_uri:
        # advertise this node as a source for the freshly computed partition
        self.addHost(rdd.id, split.index, serve_uri)
    return result
def _flush(self):
    """Write pending updates to per-key generation files and publish their URLs.

    Updates are grouped by bucket key, merged with previously fetched data,
    compressed+pickled into '<key>_<generation>_<uid>' files, registered
    with the tracker, and files older than generation-1 are purged.
    Raises Exception when the target filesystem is below 66% free.
    """
    if not self.updated:
        return
    updated_keys = {}
    path = self._get_path()
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))
    st = os.statvfs(path)
    ratio = st.f_bfree * 1.0 / st.f_blocks
    if ratio < 0.66:
        raise Exception('Insufficient disk space')
    # bucket the pending updates by their partition key
    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}
    uid = uuid.uuid4().get_hex()
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)  # None marks a deletion
            else:
                new[k] = v
        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')
        url = '%s/%s' % (server_uri, filename)
        # write via a temp file then rename for atomicity
        with open(fn + '.tmp', 'wb+') as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)
        os.rename(fn + '.tmp', fn)
        env.trackerClient.call(
            AddItemMessage('mutable_dict_new:%s' % key, url))
        # drop files older than the previous generation
        files = glob.glob(os.path.join(path, '%s_*' % key))
        for f in files:
            if int(f.split('_')[-2]) < self.generation - 1:
                try:
                    os.remove(f)
                except OSError:
                    pass
def _flush(self):
    """Persist pending updates into the exported mutable_dict dir and register URLs."""
    if not self.updated:
        return
    updated_keys = {}
    dirname = "mutable_dict"
    tmppath = env.workdir.alloc_tmp_dir(dirname)
    path = env.workdir.export(tmppath, dirname)
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))
    # group pending updates by their partition key
    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}
    # NOTE(review): uuid4().get_hex() is Python2-only; py3 uses .hex — confirm
    uid = uuid_pkg.uuid4().get_hex()
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)  # None marks a deletion
            else:
                new[k] = v
        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')
        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)
        env.trackerClient.call(
            AddItemMessage('mutable_dict_new:%s' % key, url))
        # NOTE(review): pattern '%s-*' % self.uuid does not match the
        # '<key>_<generation>_<uid>' names written above — confirm intended
        files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
        for f in files:
            if int(f.split('_')[-2]) < self.generation - 1:
                try:
                    os.remove(f)
                except OSError:
                    pass
    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
def _flush(self):
    """Persist pending updates into the exported mutable_dict dir and register URLs."""
    if not self.updated:
        return
    updated_keys = {}
    dirname = "mutable_dict"
    tmppath = env.workdir.alloc_tmp_dir(dirname)
    path = env.workdir.export(tmppath, dirname)
    uri = env.get('SERVER_URI')
    server_uri = '%s/%s' % (uri, os.path.basename(path))
    # group pending updates by their partition key
    for k, v in self.updated.items():
        key = self._get_key(k)
        if key in updated_keys:
            updated_keys[key][k] = v
        else:
            updated_keys[key] = {k: v}
    # NOTE(review): uuid4().get_hex() is Python2-only; py3 uses .hex — confirm
    uid = uuid_pkg.uuid4().get_hex()
    for key, updated in updated_keys.items():
        new = self._fetch_missing(key)
        for k, v in updated.items():
            if v is None:
                new.pop(k)  # None marks a deletion
            else:
                new[k] = v
        filename = '%s_%s_%s' % (key, self.generation, uid)
        fn = os.path.join(path, filename)
        if os.path.exists(fn):
            raise RuntimeError('conflict uuid for mutable_dict')
        url = '%s/%s' % (server_uri, filename)
        with atomic_file(fn) as f:
            data = compress(cPickle.dumps(new))
            f.write(struct.pack('<I', len(data) + 4) + data)
        env.trackerClient.call(AddItemMessage('mutable_dict_new:%s' % key, url))
        # NOTE(review): pattern '%s-*' % self.uuid does not match the
        # '<key>_<generation>_<uid>' names written above — confirm intended
        files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
        for f in files:
            if int(f.split('_')[-2]) < self.generation - 1:
                try:
                    os.remove(f)
                except OSError:
                    pass
    self.updated.clear()
    self.data = LRUDict(self.cacheLimit)
def start(self):
    """Idempotently start the download manager: shared state, server, registration."""
    if self._started:
        return
    self.manager = manager = Manager()
    # cross-process dicts for uuid -> file name / block-map bookkeeping
    self.shared_uuid_fn_dict = manager.dict()
    self.shared_uuid_map_dict = manager.dict()
    self.shared_master_blocks = manager.dict()
    self.download_cond = Condition()
    self._started = True
    self.ctx = zmq.Context()
    self.host = socket.gethostname()
    # make sure a guide manager is running before reading its address
    if GUIDE_ADDR not in env.environ:
        start_guide_manager()
    self.guide_addr = env.get(GUIDE_ADDR)
    self.random_inst = random.SystemRandom()
    self.server_addr, self.server_thread = self.start_server()
    self.uuid_state_dict = {}
    self.uuid_map_dict = {}
    self.work_dirs = env.get('WORKDIR')
    self.master_broadcast_blocks = {}
    # make our download server discoverable by other processes
    env.register(DOWNLOAD_ADDR, self.server_addr)
def getOrCompute(self, rdd, split):
    """Yield the partition's elements, computing and caching them on a miss."""
    key = (rdd.id, split.index)
    cached = self.cache.get(key)
    if cached is None:
        logger.debug("partition not in cache, %s", key)
        for item in self.cache.put(key, rdd.compute(split), is_iterator=True):
            yield item
        # advertise this node as a source only after a full compute
        serve_uri = env.get('SERVER_URI')
        if serve_uri:
            self.addHost(rdd.id, split.index, serve_uri)
    else:
        logger.debug("Found partition in cache! %s", key)
        for item in cached:
            yield item
def __init__(self, isMaster):
    """Choose the cache backend, then start the server (master) or client (worker)."""
    LocalCacheTracker.__init__(self, isMaster)
    if isMaster:
        self.cache = Cache()
        self.server = CacheTrackerServer(self.locs)
        self.server.start()
        env.register('CacheTrackerAddr', self.server.addr)
    else:
        self.cache = LocalCache(mmapCache).newKeySpace()
        self.client = CacheTrackerClient(env.get('CacheTrackerAddr'))
def getOrCompute(self, rdd, split):
    """Generate the partition's elements, caching newly computed ones."""
    key = (rdd.id, split.index)
    cached = self.cache.get(key)
    if cached is not None:
        logger.debug("Found partition in cache! %s", key)
        for element in cached:
            yield element
        return
    logger.debug("partition not in cache, %s", key)
    for element in self.cache.put(key, rdd.compute(split), is_iterator=True):
        yield element
    # register this node as a serving location once fully computed
    serve_uri = env.get("SERVER_URI")
    if serve_uri:
        self.addHost(rdd.id, split.index, serve_uri)
def get(self, key):
    """Return the cached value for key=(rdd_id, index), fetching from a peer on miss."""
    p = self.get_path(key)
    if os.path.exists(p):
        return self.load(open(p, "rb"))
    # load from other node
    if not env.get("SERVER_URI"):
        return
    rdd_id, index = key
    locs = self.tracker.getCacheUri(rdd_id, index)
    if not locs:
        return
    serve_uri = locs[-1]  # only the most recently registered host is tried
    uri = "%s/cache/%s" % (serve_uri, os.path.basename(p))
    # NOTE(review): urllib.urlopen is Python2-only (py3: urllib.request.urlopen)
    f = urllib.urlopen(uri)
    if f.code == 404:
        logger.warning("load from cache %s failed", uri)
        # drop the stale location so later lookups skip this host
        self.tracker.removeHost(rdd_id, index, serve_uri)
        f.close()
        return
    return self.load(f)
def getOutputFile(cls, shuffle_id, input_id, output_id, datasize=0):
    """Return the path for a shuffle output file, spilling to an alternate
    workdir (via symlink) when the primary one is low on space."""
    shuffleDir = env.get('WORKDIR')
    path = os.path.join(shuffleDir[0], str(shuffle_id), str(input_id))
    mkdir_p(path)
    p = os.path.join(path, str(output_id))
    if datasize > 0 and len(shuffleDir) > 1:
        # datasize > 0 means its writing
        st = os.statvfs(path)
        free = st.f_bfree * st.f_bsize
        ratio = st.f_bfree * 1.0 / st.f_blocks
        if free < max(datasize, 1 << 30) or ratio < 0.66:
            # primary dir short on space: use a random alternate and symlink
            d2 = os.path.join(random.choice(shuffleDir[1:]),
                              str(shuffle_id), str(input_id))
            mkdir_p(d2)
            p2 = os.path.join(d2, str(output_id))
            if os.path.exists(p):
                os.remove(p)
            os.symlink(p2, p)
            if os.path.islink(p2):
                os.unlink(p2)  # p == p2
            return p2
    return p
def getServerUri(cls):
    """Return the file-serving URI registered for this node, if any."""
    return env.get('SERVER_URI')
def get_tmp(cls):
    """Return a fresh temp-file path for shuffle spill, preferring alternate workdirs."""
    dirs = env.get('WORKDIR')
    alternates = dirs[1:]
    target = random.choice(alternates) if alternates else dirs[0]
    mkdir_p(target)
    return os.path.join(target, 'shuffle-%s.tmp' % uuid.uuid4().hex)
def getExecutorInfo(self, framework_id):
    """Build the Mesos ExecutorInfo used to launch dpark's executor.py."""
    info = Dict()
    info.framework_id.value = framework_id
    # launch executor.py with the same interpreter running this scheduler
    info.command.value = '%s %s' % (
        sys.executable,
        os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'executor.py')))
    info.executor_id.value = env.get('DPARK_ID', 'default')
    info.command.environment.variables = variables = []
    # run the executor under the submitting user's uid/gid
    v = Dict()
    variables.append(v)
    v.name = 'UID'
    v.value = str(os.getuid())
    v = Dict()
    variables.append(v)
    v.name = 'GID'
    v.value = str(os.getgid())
    container_image = self._get_container_image()
    if container_image:
        info.container.type = 'DOCKER'
        info.container.docker.image = container_image
        info.container.docker.parameters = parameters = []
        p = Dict()
        p.key = 'memory-swap'
        p.value = '-1'  # unlimited swap inside the container
        parameters.append(p)
        info.container.volumes = volumes = []
        # expose host accounts read-only so uid/gid resolve in the container
        for path in ['/etc/passwd', '/etc/group']:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RO'
        for path in conf.MOOSEFS_MOUNT_POINTS:
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'
        for path in conf.DPARK_WORK_DIR.split(','):
            v = Dict()
            volumes.append(v)
            v.host_path = v.container_path = path
            v.mode = 'RW'

        def _mount_volume(_volumes, _host_path, _container_path, _mode):
            # append one volume entry; empty host path mounts a container path only
            _v = Dict()
            _volumes.append(_v)
            _v.container_path = _container_path
            _v.mode = _mode
            if _host_path:
                _v.host_path = _host_path

        # user-specified volumes: host:container:mode | host:container | container
        if self.options.volumes:
            for volume in self.options.volumes.split(','):
                fields = volume.split(':')
                if len(fields) == 3:
                    host_path, container_path, mode = fields
                    mode = mode.upper()
                    assert mode in ('RO', 'RW')
                elif len(fields) == 2:
                    host_path, container_path = fields
                    mode = 'RW'
                elif len(fields) == 1:
                    container_path, = fields
                    host_path = ''
                    mode = 'RW'
                else:
                    raise Exception('cannot parse volume %s', volume)
                _mount_volume(volumes, host_path, container_path, mode)
    info.resources = resources = []
    mem = Dict()
    resources.append(mem)
    mem.name = 'mem'
    mem.type = 'SCALAR'
    mem.scalar.value = EXECUTOR_MEMORY
    cpus = Dict()
    resources.append(cpus)
    cpus.name = 'cpus'
    cpus.type = 'SCALAR'
    cpus.scalar.value = EXECUTOR_CPUS
    Script = os.path.realpath(sys.argv[0])
    info.name = Script
    # ship interpreter context so the executor can reconstruct our environment
    info.data = encode_data(
        marshal.dumps((Script, os.getcwd(), sys.path, dict(os.environ),
                       self.task_per_node, self.out_logger.addr,
                       self.err_logger.addr, self.logLevel, self.color,
                       env.environ)))
    assert len(info.data) < (50 << 20), \
        'Info data too large: %s' % (len(info.data),)
    return info