Example #1
def test():
    import pickle
    import logging
    logging.basicConfig(level=logging.DEBUG)
    from dpark.utils import compress
    from dpark.env import env
    # LocalFileShuffle, SimpleShuffleFetcher, MapOutputTracker and
    # pack_header are assumed to be imported from dpark's shuffle and
    # serialization modules in the original test module.
    env.start()

    path = LocalFileShuffle.getOutputFile(1, 0, 0)
    d = compress(pickle.dumps({'key': 'value'}, -1))
    with open(path, 'wb') as f:  # binary mode: header and payload are bytes
        f.write(pack_header(len(d), False, False) + d)

    uri = LocalFileShuffle.getServerUri()
    env.mapOutputTracker.registerMapOutputs(1, [uri])
    fetcher = SimpleShuffleFetcher()

    def func(it):
        k, v = next(it)
        assert k == 'key'
        assert v == 'value'

    fetcher.fetch(1, 0, func)

    tracker = MapOutputTracker()
    tracker.registerMapOutputs(2, [None, uri, None, None, None])
    assert tracker.getServerUris(2) == [None, uri, None, None, None]
    tracker.stop()
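
The round trip this test exercises is serialize-then-compress on the write side and the inverse on the read side. A minimal sketch, assuming decompress is exported alongside compress from dpark.utils:

import pickle
from dpark.utils import compress, decompress  # decompress assumed to invert compress

payload = compress(pickle.dumps({'key': 'value'}, -1))
assert pickle.loads(decompress(payload)) == {'key': 'value'}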
Example #2
File: schedule.py Project: posens/dpark
    def createTask(self, o, t):
        task = Dict()
        tid = t.try_id
        task.name = 'task %s' % tid
        task.task_id.value = tid
        task.agent_id.value = o.agent_id.value
        task.data = encode_data(compress(cPickle.dumps((t, tid), -1)))
        task.executor = self.executor
        if len(task.data) > 1000 * 1024:
            logger.warning('task too large: %s %d', t, len(task.data))

        assert len(task.data) < (50 << 20), \
            'Task data too large: %s' % (len(task.data),)

        resources = task.resources = []

        cpu = Dict()
        resources.append(cpu)
        cpu.name = 'cpus'
        cpu.type = 'SCALAR'
        cpu.scalar.value = t.cpus

        mem = Dict()
        resources.append(mem)
        mem.name = 'mem'
        mem.type = 'SCALAR'
        mem.scalar.value = t.mem

        gpu = Dict()
        resources.append(gpu)
        gpu.name = 'gpus'
        gpu.type = 'SCALAR'
        gpu.scalar.value = t.gpus

        return task
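
The three resource entries differ only in name and value. A hypothetical helper (not part of dpark) that makes the repeated SCALAR pattern explicit:

def scalar_resource(name, value):
    # Build one Mesos-style SCALAR resource entry, mirroring the
    # cpus/mem/gpus blocks above; Dict is the same attribute-access
    # dict class used by createTask.
    r = Dict()
    r.name = name
    r.type = 'SCALAR'
    r.scalar.value = value
    return r

# task.resources = [scalar_resource('cpus', t.cpus),
#                   scalar_resource('mem', t.mem),
#                   scalar_resource('gpus', t.gpus)]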
Example #3
File: task.py Project: douban/dpark
    def _prepare(self, items):
        items = list(items)
        try:
            if marshalable(items):
                is_marshal, d = True, marshal.dumps(items)
            else:
                is_marshal, d = False, cPickle.dumps(items, -1)
        except ValueError:
            is_marshal, d = False, cPickle.dumps(items, -1)
        data = compress(d)
        size = len(data)
        return (is_marshal, data), size
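
A hedged sketch of the matching restore step: the is_marshal flag recorded by _prepare selects the decoder after decompression (decompress assumed from dpark.utils):

import marshal
import pickle

from dpark.utils import decompress  # assumed to invert compress

def _restore(is_marshal, data):
    # Inverse of _prepare's (is_marshal, compressed-bytes) pair.
    d = decompress(data)
    return marshal.loads(d) if is_marshal else pickle.loads(d)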
Example #4
File: executor.py Project: douban/dpark
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            # shuffle_id start from 1
            swd = ShuffleWorkDir(0, task.id, ttid.task_try)
            tmppath = swd.alloc_tmp(len(data))
            with open(tmppath, 'wb') as f:
                f.write(data)
            path = swd.export(tmppath)
            data = '/'.join([env.server_uri] + path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        fatal_exceptions = (DparkUserFatalError, ArithmeticError, ValueError,
                            LookupError, SyntaxError, TypeError,
                            AssertionError)
        prefix = "FATAL" if isinstance(e, fatal_exceptions) else "FAILED"
        return TaskState.failed, '{}_EXCEPTION_{}'.format(
            prefix, ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
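
The flag returned above packs two facts: bit 0 is the serializer (0 = marshal, 1 = pickle), and bit 1, added by flag += 2, means the payload was spilled to a served file and data is now a URI. A hedged sketch of how the receiving side could decode it:

import marshal
import pickle

from dpark.utils import decompress  # assumed to invert compress

def decode_result(flag, data, fetch):
    # `fetch` is a hypothetical helper that downloads a spilled result
    # from the URI built above ('/'-joined under env.server_uri).
    if flag & 2:
        data = fetch(data)
    d = decompress(data)  # compress() ran before the size check, so
                          # both inline and spilled payloads need it
    return marshal.loads(d) if (flag & 1) == 0 else pickle.loads(d)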
Example #5
    def _flush(self):
        if not self.updated:
            return

        updated_keys = {}
        dirname = "mutable_dict"
        tmppath = env.workdir.alloc_tmp_dir(dirname)
        path = env.workdir.export(tmppath, dirname)
        uri = env.get('SERVER_URI')
        server_uri = '%s/%s' % (uri, os.path.basename(path))

        for k, v in self.updated.items():
            key = self._get_key(k)
            if key in updated_keys:
                updated_keys[key][k] = v
            else:
                updated_keys[key] = {k: v}

        uid = uuid_pkg.uuid4().get_hex()  # Python 2 accessor; uuid4().hex on Python 3
        for key, updated in updated_keys.items():
            new = self._fetch_missing(key)
            for k, v in updated.items():
                if v is None:
                    new.pop(k)
                else:
                    new[k] = v

            filename = '%s_%s_%s' % (key, self.generation, uid)
            fn = os.path.join(path, filename)
            if os.path.exists(fn):
                raise RuntimeError('conflict uuid for mutable_dict')

            url = '%s/%s' % (server_uri, filename)
            with atomic_file(fn) as f:
                data = compress(cPickle.dumps(new))
                f.write(struct.pack('<I', len(data) + 4) + data)

            env.trackerClient.call(
                AddItemMessage('mutable_dict_new:%s' % key, url))

            files = glob.glob(os.path.join(path, '%s-*' % self.uuid))
            for f in files:
                if int(f.split('_')[-2]) < self.generation - 1:
                    try:
                        os.remove(f)
                    except OSError:
                        pass

        self.updated.clear()
        self.data = LRUDict(self.cacheLimit)
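
Each file written above is a single length-prefixed record: a little-endian uint32 that counts itself, followed by the compressed pickle. A hedged reader sketch (decompress again assumed to invert compress):

import pickle
import struct

from dpark.utils import decompress  # assumed to invert compress

def read_record(fn):
    with open(fn, 'rb') as f:
        total, = struct.unpack('<I', f.read(4))
        data = f.read(total - 4)  # the length prefix counts itself
    return pickle.loads(decompress(data))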
Example #6
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return TaskState.finished, cPickle.dumps(
            ((flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return TaskState.failed, TaskEndReason.fetch_failed, str(
            e), cPickle.dumps(e)
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        ename = e.__class__.__name__
        return TaskState.failed, 'FAILED_EXCEPTION_{}'.format(
            ename), msg, cPickle.dumps(e)
    finally:
        gc.collect()
        gc.enable()
Example #7
File: executor.py Project: posens/dpark
def run_task(task_data):
    try:
        gc.disable()
        task, task_try_id = loads(decompress(task_data))
        ttid = TTID(task_try_id)
        Accumulator.clear()
        result = task.run(ttid.ttid)
        env.task_stats.bytes_max_rss = resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss * 1024
        accUpdate = Accumulator.values()
        MutableDict.flush()

        if marshalable(result):
            try:
                flag, data = 0, marshal.dumps(result)
            except Exception:
                flag, data = 1, cPickle.dumps(result, -1)

        else:
            flag, data = 1, cPickle.dumps(result, -1)
        data = compress(data)

        if len(data) > TASK_RESULT_LIMIT:
            path = LocalFileShuffle.getOutputFile(0, task.id, ttid.task_try,
                                                  len(data))
            with open(path, 'wb') as f:
                f.write(data)
            data = '/'.join([LocalFileShuffle.getServerUri()] +
                            path.split('/')[-3:])
            flag += 2

        return 'TASK_FINISHED', cPickle.dumps(
            (Success(), (flag, data), accUpdate, env.task_stats), -1)
    except FetchFailed as e:
        return 'TASK_FAILED', cPickle.dumps((e, None, None, None), -1)
    except Exception:
        import traceback
        msg = traceback.format_exc()
        return 'TASK_FAILED', cPickle.dumps(
            (OtherFailure(msg), None, None, None), -1)
    finally:
        gc.collect()
        gc.enable()
Example #8
    def to_blocks(self, uuid, obj):
        try:
            if marshalable(obj):
                buf = marshal.dumps((uuid, obj))
                type_ = MARSHAL_TYPE
            else:
                buf = cPickle.dumps((uuid, obj), -1)
                type_ = PICKLE_TYPE

        except Exception:
            buf = cPickle.dumps((uuid, obj), -1)
            type_ = PICKLE_TYPE

        checksum = binascii.crc32(buf) & 0xFFFF
        stream = struct.pack(self.header_fmt, type_, checksum) + buf
        blockNum = (len(stream) + (BLOCK_SIZE - 1)) >> BLOCK_SHIFT
        blocks = [compress(stream[i * BLOCK_SIZE:(i + 1) * BLOCK_SIZE])
                  for i in range(blockNum)]
        sizes = [len(block) for block in blocks]
        size_l = accumulate_list(sizes)
        block_map = list(izip(size_l[:-1], sizes))
        return blocks, size_l[-1], block_map
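
to_blocks compresses fixed-size slices of a checksummed header+payload stream and records each block's offset and size in block_map. A hedged inverse, assuming the same header_fmt and type constants as the original class:

import binascii
import marshal
import pickle
import struct

from dpark.utils import decompress  # assumed to invert compress

def from_blocks(blocks, header_fmt):
    # Reassemble the stream, verify the 16-bit tail of crc32, then
    # decode by type. MARSHAL_TYPE is the constant used by to_blocks.
    stream = b''.join(decompress(b) for b in blocks)
    n = struct.calcsize(header_fmt)
    type_, checksum = struct.unpack(header_fmt, stream[:n])
    buf = stream[n:]
    assert binascii.crc32(buf) & 0xFFFF == checksum
    return marshal.loads(buf) if type_ == MARSHAL_TYPE else pickle.loads(buf)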
Example #9
def write_buf(stream, buf, is_marshal):
    buf = compress(buf)
    size = len(buf)
    stream.write(pack_header(size, is_marshal, True))
    stream.write(buf)
    return size + 4
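
write_buf's return value counts the compressed payload plus a header assumed to be 4 bytes (hence size + 4). A hedged read-side counterpart, with unpack_header standing in as a hypothetical inverse of pack_header:

from dpark.utils import decompress  # assumed to invert compress

def read_buf(stream):
    header = stream.read(4)  # header assumed to be 4 bytes, matching size + 4
    size, is_marshal, flag = unpack_header(header)  # hypothetical inverse of pack_header
    buf = stream.read(size)
    return decompress(buf), is_marshal  # payload was compressed by write_buf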