Example #1
def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"\n",
    )
    _, values2 = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"foo",
    )
    assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b"\n") for r in res)
    ourlines = b"".join(res).split(b"\n")
    testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
    assert ourlines == testlines

    # delimiter not at the end
    d = b"}"
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*", blocksize=blocksize, delimiter=d
    )
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b"}") for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
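Note: a minimal sketch (plain lists standing in for the delayed blocks) of the compute(*concat(values)) idiom used throughout these examples; read_bytes returns one list of blocks per matched file, so the nested structure is flattened with toolz.concat before computing.

from toolz import concat

values = [["file0-block0", "file0-block1"], ["file1-block0"]]  # stand-ins for delayed blocks
flat = list(concat(values))
assert flat == ["file0-block0", "file0-block1", "file1-block0"]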
Example #2
def test_read_bytes_delimited():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*",
                                   blocksize=bs,
                                   delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*",
                                    blocksize=bs,
                                    delimiter=b"foo")
            assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*",
                                   blocksize=bs,
                                   delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
Example #3
def test_modification_time_read_bytes():
    with s3_context("compress", files):
        _, a = read_bytes("s3://compress/test/accounts.*", anon=True)
        _, b = read_bytes("s3://compress/test/accounts.*", anon=True)

        assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context("compress", valmap(double, files)):
        _, c = read_bytes("s3://compress/test/accounts.*", anon=True)

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
Example #4
def test_deterministic_key_names(hdfs):
    data = b"abc\n" * int(1e3)
    fn = "%s/file" % basedir

    with hdfs.open(fn, "wb", replication=1) as fil:
        fil.write(data)

    _, x = read_bytes("hdfs://%s/*" % basedir, delimiter=b"\n", sample=False)
    _, y = read_bytes("hdfs://%s/*" % basedir, delimiter=b"\n", sample=False)
    _, z = read_bytes("hdfs://%s/*" % basedir, delimiter=b"c", sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
Example #5
    def _dict(self):
        if hasattr(self, "_cached_dict"):
            return self._cached_dict["dsk"]
        else:
            keys = tuple(map(blockwise_token, range(len(self.indices))))
            dsk, _ = fuse(self.dsk, [self.output])
            func = SubgraphCallable(dsk, self.output, keys)

            dsk = make_blockwise_graph(
                func,
                self.output,
                self.output_indices,
                *list(toolz.concat(self.indices)),
                new_axes=self.new_axes,
                numblocks=self.numblocks,
                concatenate=self.concatenate,
                output_blocks=self.output_blocks,
                dims=self.dims,
            )

            # Handle IO Subgraph
            dsk = _inject_io_tasks(dsk, self.io_deps, self.output_indices,
                                   self.new_axes)

            self._cached_dict = {"dsk": dsk}
        return self._cached_dict["dsk"]
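Note: a minimal sketch of the SubgraphCallable/blockwise_token pattern shared by these _dict implementations. The tiny dsk below is hypothetical; only dask.optimization.SubgraphCallable and dask.blockwise.blockwise_token are assumed. The fused subgraph becomes a callable whose positional arguments fill the placeholder keys.

from operator import add
from dask.blockwise import blockwise_token
from dask.optimization import SubgraphCallable

keys = tuple(map(blockwise_token, range(2)))              # placeholder names, e.g. ("_0", "_1")
dsk = {"tmp": (add, keys[0], keys[1]), "out": (add, "tmp", 10)}
func = SubgraphCallable(dsk, "out", keys)
assert func(1, 2) == 13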
Example #6
    def _dict(self):
        if hasattr(self, "_cached_dict"):
            return self._cached_dict["dsk"]
        else:
            keys = tuple(map(blockwise_token, range(len(self.indices))))
            dsk, _ = fuse(self.dsk, [self.output])
            func = SubgraphCallable(dsk, self.output, keys)

            dsk = make_blockwise_graph(
                func,
                self.output,
                self.output_indices,
                *list(toolz.concat(self.indices)),
                new_axes=self.new_axes,
                numblocks=self.numblocks,
                concatenate=self.concatenate,
                output_blocks=self.output_blocks,
                dims=self.dims,
            )

            if self.io_subgraph:
                # This is an IO layer.
                for k in dsk:
                    io_key = (self.io_name,) + tuple([k[i] for i in range(1, len(k))])
                    if io_key in dsk[k]:
                        # Inject IO-function arguments into the blockwise graph
                        # as a single (packed) tuple.
                        io_item = self.io_subgraph.get(io_key)
                        io_item = list(io_item[1:]) if len(io_item) > 1 else []
                        new_task = [io_item if v == io_key else v for v in dsk[k]]
                        dsk[k] = tuple(new_task)

            self._cached_dict = {"dsk": dsk}
        return self._cached_dict["dsk"]
Example #7
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop("dask_key_name", None)
    pure = kwargs.pop("pure", pure)

    if dask_key_name is None:
        name = "%s-%s" % (
            funcname(func),
            tokenize(func_token, *args, pure=pure, **kwargs),
        )
    else:
        name = dask_key_name

    args2, collections = unzip(map(unpack_collections, args), 2)
    collections = list(concat(collections))

    if kwargs:
        dask_kwargs, collections2 = unpack_collections(kwargs)
        collections.extend(collections2)
        task = (apply, func, list(args2), dask_kwargs)
    else:
        task = (func, ) + args2

    graph = HighLevelGraph.from_collections(name, {name: task},
                                            dependencies=collections)
    nout = nout if nout is not None else None
    return Delayed(name, graph, length=nout)
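Note: a hedged sketch (using dask's public tokenize and funcname helpers, not call_function itself) of why the generated name is deterministic when pure=True: hashing the same function and arguments twice yields the same key within a process.

from dask.base import tokenize
from dask.utils import funcname

def inc(x):
    return x + 1

name1 = "%s-%s" % (funcname(inc), tokenize(inc, 1, pure=True))
name2 = "%s-%s" % (funcname(inc), tokenize(inc, 1, pure=True))
assert name1 == name2  # same inputs -> same key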
Example #8
def test_repeat():
    x = np.random.random((10, 11, 13))
    d = da.from_array(x, chunks=(4, 5, 3))

    repeats = [0, 1, 2, 5]
    axes = [-3, -2, -1, 0, 1, 2]

    for r in repeats:
        for a in axes:
            assert_eq(x.repeat(r, axis=a), d.repeat(r, axis=a))

    assert_eq(d.repeat(2, 0), da.repeat(d, 2, 0))

    with pytest.raises(NotImplementedError):
        da.repeat(d, np.arange(10))

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2, None)

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2)

    for invalid_axis in [3, -4]:
        with pytest.raises(ValueError):
            da.repeat(d, 2, axis=invalid_axis)

    x = np.arange(5)
    d = da.arange(5, chunks=(2, ))

    assert_eq(x.repeat(3), d.repeat(3))

    for r in [1, 2, 3, 4]:
        assert all(concat(d.repeat(r).chunks))
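Note: a small sketch of what the final assertion checks. A dask array's .chunks is a tuple of tuples of block sizes (the tuples below are hypothetical), so all(concat(chunks)) is True only if no block has length 0.

from toolz import concat

chunks = ((2, 2, 1), (3, 3))                 # hypothetical .chunks of a 2-D array
assert all(concat(chunks))                   # no zero-length blocks
assert not all(concat(((2, 0), (3,))))       # a zero-length block fails the check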
Example #9
File: blockwise.py Project: stromal/dask
    def _dict(self):
        if hasattr(self, "_cached_dict"):
            return self._cached_dict["dsk"]
        else:
            keys = tuple(map(blockwise_token, range(len(self.indices))))
            dsk, _ = fuse(self.dsk, [self.output])
            func = SubgraphCallable(dsk, self.output, keys)

            key_deps = {}
            non_blockwise_keys = set()
            dsk = make_blockwise_graph(
                func,
                self.output,
                self.output_indices,
                *list(toolz.concat(self.indices)),
                new_axes=self.new_axes,
                numblocks=self.numblocks,
                concatenate=self.concatenate,
                key_deps=key_deps,
                non_blockwise_keys=non_blockwise_keys,
            )
            self._cached_dict = {
                "dsk": dsk,
                "basic_layer": BasicLayer(dsk, key_deps, non_blockwise_keys),
            }
        return self._cached_dict["dsk"]
Example #10
async def assert_balanced(inp, expected, c, s, *workers):
    steal = s.extensions["stealing"]
    steal._pc.stop()

    counter = itertools.count()
    tasks = list(concat(inp))
    data_seq = itertools.count()

    futures = []
    for w, ts in zip(workers, inp):
        for t in sorted(ts, reverse=True):
            if t:
                [dat] = await c.scatter([next(data_seq)], workers=w.address)
                ts = s.tasks[dat.key]
                # Ensure scheduler state stays consistent
                old_nbytes = ts.nbytes
                ts.nbytes = s.bandwidth * t
                for ws in ts.who_has:
                    ws.nbytes += ts.nbytes - old_nbytes
            else:
                dat = 123
            i = next(counter)
            f = c.submit(
                func,
                dat,
                key="%d-%d" % (int(t), i),
                workers=w.address,
                allow_other_workers=True,
                pure=False,
                priority=-i,
            )
            futures.append(f)

    while len(s.rprocessing) < len(futures):
        await asyncio.sleep(0.001)

    for i in range(10):
        steal.balance()

        while steal.in_flight:
            await asyncio.sleep(0.001)

        result = [
            sorted([int(key_split(k)) for k in s.processing[w.address]],
                   reverse=True) for w in workers
        ]

        result2 = sorted(result, reverse=True)
        expected2 = sorted(expected, reverse=True)

        if config.get("pdb-on-err"):
            if result2 != expected2:
                import pdb

                pdb.set_trace()

        if result2 == expected2:
            return
    raise Exception("Expected: {}; got: {}".format(str(expected2),
                                                   str(result2)))
Example #11
File: test_local.py Project: bigmpc/dask
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode="b"):
        sample, vals = read_bytes(".test.account*", blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b"\n")
        testlines = b"".join(files.values()).split(b"\n")
        assert set(ourlines) == set(testlines)
Example #12
File: test_local.py Project: bigmpc/dask
def test_compression(fmt, blocksize):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    ".test.accounts.*.json",
                    blocksize=blocksize,
                    delimiter=b"\n",
                    compression=fmt,
                )
            return
        sample, values = read_bytes(
            ".test.accounts.*.json",
            blocksize=blocksize,
            delimiter=b"\n",
            compression=fmt,
        )
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
Example #13
def test_compression(s3, fmt, blocksize, s3so):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                    **s3so
                )
            return
        sample, values = read_bytes(
            "s3://compress/test/accounts.*",
            compression=fmt,
            blocksize=blocksize,
            **s3so
        )
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
Example #14
    def __dask_distributed_pack__(self, client):
        from distributed.worker import dumps_function
        from distributed.utils import CancelledError
        from distributed.utils_comm import unpack_remotedata

        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])

        dsk = (SubgraphCallable(dsk, self.output, keys), )
        dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)

        func = dumps_function(dsk[0])
        func_future_args = dsk[1:]

        indices = list(toolz.concat(self.indices))
        indices, indices_unpacked_futures = unpack_remotedata(indices,
                                                              byte_keys=True)

        # Check the legality of the unpacked futures
        for future in itertools.chain(dsk_unpacked_futures,
                                      indices_unpacked_futures):
            if future.client is not client:
                raise ValueError(
                    "Inputs contain futures that were created by another client."
                )
            if stringify(future.key) not in client.futures:
                raise CancelledError(stringify(future.key))

        # All blockwise tasks will depend on the futures in `indices`
        global_dependencies = tuple(
            stringify(f.key) for f in indices_unpacked_futures)

        ret = {
            "output": self.output,
            "output_indices": self.output_indices,
            "func": func,
            "func_future_args": func_future_args,
            "global_dependencies": global_dependencies,
            "indices": indices,
            "numblocks": self.numblocks,
            "concatenate": self.concatenate,
            "new_axes": self.new_axes,
            "io_subgraph": (self.io_name, self.io_subgraph) if self.io_name else (None, None),
            "output_blocks": self.output_blocks,
            "dims": self.dims,
        }
        return ret
Example #15
File: test_local.py Project: bigmpc/dask
def test_names():
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))

        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
Example #16
File: blockwise.py Project: stromal/dask
def broadcast_dimensions(argpairs,
                         numblocks,
                         sentinels=(1, (1, )),
                         consolidate=None):
    """Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([
        zip(inds, dims) for (x, inds), (x, dims) in toolz.join(
            toolz.first, argpairs2, toolz.first, numblocks.items())
    ])

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict(
        (k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
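Note: a sketch of the intermediate pairs built inside broadcast_dimensions for the first doctest. Joining argpairs with numblocks on the argument name yields (index, dim) pairs, and groupby collects them per index letter; the final set comprehension here is only illustrative.

import toolz

argpairs = [('x', 'ij'), ('y', 'ji')]
numblocks = {'x': (2, 3), 'y': (3, 2)}
pairs = list(toolz.concat(
    zip(inds, dims)
    for (_, inds), (_, dims) in toolz.join(
        toolz.first, argpairs, toolz.first, numblocks.items())
))
grouped = toolz.groupby(0, pairs)
assert {k: {d for _, d in v} for k, v in grouped.items()} == {'i': {2}, 'j': {3}}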
Example #17
File: chunk.py Project: m-rossi/dask
def coarsen(reduction, x, axes, trim_excess=False, **kwargs):
    """Coarsen array by applying reduction to fixed size neighborhoods

    Parameters
    ----------
    reduction: function
        Function like np.sum, np.mean, etc...
    x: np.ndarray
        Array to be coarsened
    axes: dict
        Mapping of axis to coarsening factor

    Examples
    --------
    >>> x = np.array([1, 2, 3, 4, 5, 6])
    >>> coarsen(np.sum, x, {0: 2})
    array([ 3,  7, 11])
    >>> coarsen(np.max, x, {0: 3})
    array([3, 6])

    Provide dictionary of scale per dimension

    >>> x = np.arange(24).reshape((4, 6))
    >>> x
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11],
           [12, 13, 14, 15, 16, 17],
           [18, 19, 20, 21, 22, 23]])

    >>> coarsen(np.min, x, {0: 2, 1: 3})
    array([[ 0,  3],
           [12, 15]])

    You must avoid excess elements explicitly

    >>> x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    >>> coarsen(np.min, x, {0: 3}, trim_excess=True)
    array([1, 4])
    """
    # Insert singleton dimensions if they don't exist already
    for i in range(x.ndim):
        if i not in axes:
            axes[i] = 1

    if trim_excess:
        ind = tuple(
            slice(0, -(d % axes[i])) if d % axes[i] else slice(None, None)
            for i, d in enumerate(x.shape))
        x = x[ind]

    # (10, 10) -> (5, 2, 5, 2)
    newshape = tuple(
        concat([(x.shape[i] // axes[i], axes[i]) for i in range(x.ndim)]))

    return reduction(x.reshape(newshape),
                     axis=tuple(range(1, x.ndim * 2, 2)),
                     **kwargs)
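Note: a sketch of the reshape trick coarsen relies on, using the docstring's first example: a length-6 array coarsened by 2 is reshaped to (3, 2) and reduced over the size-2 axis.

import numpy as np

x = np.array([1, 2, 3, 4, 5, 6])
assert (x.reshape((3, 2)).sum(axis=1) == np.array([3, 7, 11])).all()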
Example #18
def test_open_files_write(s3):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb")
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)
    sample, values = read_bytes("s3://" + test_bucket_name + "/more/test/accounts.*")
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
Example #19
File: utils.py Project: zuiwanting/dask
def _check_dsk(dsk):
    """ Check that graph is well named and non-overlapping """
    if not isinstance(dsk, HighLevelGraph):
        return

    assert all(isinstance(k, (tuple, str)) for k in dsk.layers)
    freqs = frequencies(concat(dsk.dicts.values()))
    non_one = {k: v for k, v in freqs.items() if v != 1}
    assert not non_one, non_one
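Note: a sketch of the duplicate-key check above with plain dicts standing in for HighLevelGraph layers: a key that appears in two layers shows up with count 2 and trips the assertion.

from toolz import concat, frequencies

layers = {"a": {("a", 0): 1}, "b": {("a", 0): 2, ("b", 0): 3}}   # hypothetical layer dicts
freqs = frequencies(concat(d.keys() for d in layers.values()))
assert {k: v for k, v in freqs.items() if v != 1} == {("a", 0): 2}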
Example #20
File: routines.py Project: tmct/dask
def split_at_breaks(array, breaks, axis=0):
    """ Split an array into a list of arrays (using slices) at the given breaks

    >>> split_at_breaks(np.arange(6), [3, 5])
    [array([0, 1, 2]), array([3, 4]), array([5])]
    """
    padded_breaks = concat([[None], breaks, [None]])
    slices = [slice(i, j) for i, j in sliding_window(2, padded_breaks)]
    preslice = (slice(None), ) * axis
    split_array = [array[preslice + (s, )] for s in slices]
    return split_array
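Note: a sketch of how the slices above come together for breaks=[3, 5]: padding with None gives the boundaries [None, 3, 5, None], and each consecutive pair becomes one slice.

from toolz import concat, sliding_window

padded = list(concat([[None], [3, 5], [None]]))
assert list(sliding_window(2, padded)) == [(None, 3), (3, 5), (5, None)]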
Example #21
File: test_local.py Project: bigmpc/dask
def test_read_bytes_block():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            assert list(map(len, vals)) == [(len(v) // bs + 1) for v in files.values()]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
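Note: the block-count assertion above (len(v) // bs + 1 blocks per file) relies on the test file sizes not being exact multiples of the blocksize; a sketch with hypothetical numbers:

n, bs = 100, 45                       # hypothetical file length and blocksize
offsets = list(range(0, n, bs))       # block start offsets: [0, 45, 90]
assert len(offsets) == n // bs + 1    # holds because n is not a multiple of bs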
Example #22
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes(
        "s3://" + test_bucket_name + "/test/account*", blocksize=blocksize
    )
    assert list(map(len, vals)) == [(len(v) // blocksize + 1) for v in files.values()]

    results = compute(*concat(vals))
    assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

    ourlines = b"".join(results).split(b"\n")
    testlines = b"".join(files.values()).split(b"\n")
    assert set(ourlines) == set(testlines)
Example #23
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)

    (results,) = dask.compute(list(concat(vals)))
    assert data == results
Example #24
def test_read_bytes(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*")
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b"\n")

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], "dask")

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
Example #25
File: test_local.py Project: bigmpc/dask
def test_read_bytes():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*")
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], "dask")

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
Example #26
async def test_stress_communication(c, s, *workers):
    s.validate = False  # very slow otherwise
    da = pytest.importorskip("dask.array")
    # Test consumes many file descriptors and can hang if the limit is too low
    resource = pytest.importorskip("resource")
    bump_rlimit(resource.RLIMIT_NOFILE, 8192)

    n = 20
    xs = [da.random.random((100, 100), chunks=(5, 5)) for i in range(n)]
    ys = [x + x.T for x in xs]
    z = da.atop(vsum, "ij", *concat(zip(ys, ["ij"] * n)), dtype="float64")

    future = c.compute(z.sum())

    result = await future
    assert isinstance(result, float)
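Note: a sketch of the argument interleaving passed to da.atop above: zip pairs each array with its index string and concat flattens the pairs into the arg, ind, arg, ind, ... form that atop/blockwise expects (ys_demo is a stand-in for the dask arrays).

from toolz import concat

ys_demo = ["y0", "y1"]
assert list(concat(zip(ys_demo, ["ij"] * 2))) == ["y0", "ij", "y1", "ij"]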
Example #27
    def _dict(self):
        if hasattr(self, "_cached_dict"):
            return self._cached_dict
        else:
            keys = tuple(map(blockwise_token, range(len(self.indices))))
            dsk, _ = fuse(self.dsk, [self.output])
            func = SubgraphCallable(dsk, self.output, keys)
            self._cached_dict = make_blockwise_graph(
                func,
                self.output,
                self.output_indices,
                *list(toolz.concat(self.indices)),
                new_axes=self.new_axes,
                numblocks=self.numblocks,
                concatenate=self.concatenate,
            )
        return self._cached_dict
Example #28
async def scatter_to_workers(nthreads,
                             data,
                             rpc=rpc,
                             report=True,
                             serializers=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have.  nthreads should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    assert isinstance(nthreads, dict)
    assert isinstance(data, dict)

    workers = list(concat([w] * nc for w, nc in nthreads.items()))
    names, data = list(zip(*data.items()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {
        worker: {key: value
                 for _, key, value in v}
        for worker, v in d.items()
    }

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = await All([
            rpcs[address].update_data(data=v,
                                      report=report,
                                      serializers=serializers)
            for address, v in d.items()
        ])
    finally:
        for r in rpcs.values():
            await r.close_rpc()

    nbytes = merge(o["nbytes"] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    return (names, who_has, nbytes)
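Note: a sketch of the core-weighted round-robin above using plain data and no RPC: each worker appears once per thread, and the data keys are dealt out in turn (the worker names and the zero counter offset are hypothetical).

from itertools import cycle
from toolz import concat, drop, groupby

nthreads = {"w1": 2, "w2": 1}                                    # hypothetical worker -> cores
data = {"x": 1, "y": 2, "z": 3}
workers = list(concat([w] * nc for w, nc in nthreads.items()))   # ["w1", "w1", "w2"]
names, values = zip(*data.items())
L = list(zip(drop(0, cycle(workers)), names, values))            # counter offset of 0 for the sketch
assert groupby(0, L) == {"w1": [("w1", "x", 1), ("w1", "y", 2)], "w2": [("w2", "z", 3)]}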
Example #29
File: blockwise.py Project: manuels/dask
    def _dict(self):
        if hasattr(self, "_cached_dict"):
            return self._cached_dict["dsk"]
        else:
            keys = tuple(map(blockwise_token, range(len(self.indices))))
            dsk, _ = fuse(self.dsk, [self.output])
            func = SubgraphCallable(dsk, self.output, keys)

            key_deps = {}
            non_blockwise_keys = set()
            dsk = make_blockwise_graph(
                func,
                self.output,
                self.output_indices,
                *list(toolz.concat(self.indices)),
                new_axes=self.new_axes,
                numblocks=self.numblocks,
                concatenate=self.concatenate,
                key_deps=key_deps,
                non_blockwise_keys=non_blockwise_keys,
            )

            if self.io_subgraph:
                # This is an IO layer.
                for k in dsk:
                    io_key = (self.io_name, ) + tuple(
                        [k[i] for i in range(1, len(k))])
                    if io_key in dsk[k]:
                        # Inject IO-function arguments into the blockwise graph
                        # as a single (packed) tuple.
                        io_item = self.io_subgraph.get(io_key)
                        io_item = list(io_item[1:]) if len(io_item) > 1 else []
                        new_task = [
                            io_item if v == io_key else v for v in dsk[k]
                        ]
                        dsk[k] = tuple(new_task)

                # Clear IO "placeholder" dependencies
                for k in key_deps:
                    if k[0] == self.output:
                        key_deps[k] = set()

            self._cached_dict = {
                "dsk": dsk,
                "basic_layer": BasicLayer(dsk, key_deps, non_blockwise_keys),
            }
        return self._cached_dict["dsk"]
Example #30
def test_read_text(fmt, bs, encoding, include_path):
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)
    compress = utils.compress[fmt]
    files2 = {k: compress(v.encode(encoding)) for k, v in files.items()}
    with filetexts(files2, mode="b"):
        b = read_text(".test.accounts.*.json",
                      compression=fmt,
                      blocksize=bs,
                      encoding=encoding)
        (L, ) = compute(b)
        assert "".join(L) == expected

        o = read_text(
            sorted(files),
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            include_path=include_path,
        )
        b = o.pluck(0) if include_path else o
        (L, ) = compute(b)
        assert "".join(L) == expected
        if include_path:
            (paths, ) = compute(o.pluck(1))
            expected_paths = list(
                concat([[k] * v.count("\n") for k, v in files.items()]))
            assert len(paths) == len(expected_paths)
            for path, expected_path in zip(paths, expected_paths):
                assert path.endswith(expected_path)

        blocks = read_text(
            ".test.accounts.*.json",
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            collection=False,
        )
        L = compute(*blocks)
        assert "".join(line for block in L for line in block) == expected