예제 #1
0
def test_sanitize_index():
    """Check ``sanitize_index`` on a string, a pandas Series and a tuple."""
    pd = pytest.importorskip('pandas')
    expected = [1, 2, 3]

    # A string argument must be rejected with TypeError.
    with pytest.raises(TypeError):
        sanitize_index('Hello!')

    # Both the Series and the tuple must come back equal to the plain list.
    for index in (pd.Series([1, 2, 3]), (1, 2, 3)):
        np.testing.assert_equal(sanitize_index(index), expected)
예제 #2
0
파일: test_slicing.py 프로젝트: pitrou/dask
def test_sanitize_index():
    """Check ``sanitize_index`` on a string, a pandas Series and a tuple."""
    pd = pytest.importorskip('pandas')

    # A string argument must be rejected with TypeError.
    pytest.raises(TypeError, sanitize_index, 'Hello!')

    from_series = sanitize_index(pd.Series([1, 2, 3]))
    assert from_series == [1, 2, 3]

    from_tuple = sanitize_index((1, 2, 3))
    assert from_tuple == [1, 2, 3]
예제 #3
0
def test_sanitize_index():
    """Check ``sanitize_index`` on a string, a pandas Series and a tuple."""
    pd = pytest.importorskip('pandas')

    # A string argument must be rejected with TypeError.
    with pytest.raises(TypeError):
        sanitize_index('Hello!')

    # Each accepted input must compare equal to the same plain list.
    accepted_inputs = (pd.Series([1, 2, 3]), (1, 2, 3))
    for candidate in accepted_inputs:
        assert sanitize_index(candidate) == [1, 2, 3]
예제 #4
0
파일: reshape.py 프로젝트: jakirkham/dask
def reshape(x, shape, merge_chunks=True, limit=None):
    """Reshape array to new shape

    Parameters
    ----------
    x : Array
        The array to reshape.
    shape : int or tuple of ints
        The new shape should be compatible with the original shape. If
        an integer, then the result will be a 1-D array of that length.
        One shape dimension can be -1. In this case, the value is
        inferred from the length of the array and remaining dimensions.
    merge_chunks : bool, default True
        Whether to merge chunks using the logic in :meth:`dask.array.rechunk`
        when communication is necessary given the input array chunking and
        the output shape. With ``merge_chunks==False``, the input array will
        be rechunked to a chunksize of 1, which can create very many tasks.
    limit: int (optional)
        The maximum block size to target in bytes. If no limit is provided,
        it defaults to using the ``array.chunk-size`` Dask config value.
        The value is passed through ``parse_bytes``, so a string such as
        ``'128 MiB'`` is presumably accepted as well.

    Returns
    -------
    Array
        ``x`` itself when it already has the requested shape (or when a 1-D
        array is reshaped with ``-1``); otherwise a new array with the
        requested shape.

    Raises
    ------
    ValueError
        If more than one dimension of ``shape`` is ``-1``, if the shape of
        ``x`` is unknown (contains NaN), or if the total size of ``shape``
        differs from ``x.size``.

    Notes
    -----
    This is a parallelized version of the ``np.reshape`` function with the
    following limitations:

    1.  It assumes that the array is stored in `row-major order`_
    2.  It only allows for reshapings that collapse or merge dimensions like
        ``(1, 2, 3, 4) -> (1, 6, 4)`` or ``(64,) -> (4, 4, 4)``

    .. _`row-major order`: https://en.wikipedia.org/wiki/Row-_and_column-major_order

    When communication is necessary this algorithm depends on the logic within
    rechunk.  It endeavors to keep chunk sizes roughly the same when possible.

    See :ref:`array-chunks.reshaping` for a discussion of the tradeoffs of
    ``merge_chunks``.

    See Also
    --------
    dask.array.rechunk
    numpy.reshape
    """
    # Sanitize inputs, look for -1 in shape
    from dask.array.core import PerformanceWarning
    from dask.array.slicing import sanitize_index

    # Honor the documented integer form of ``shape`` by treating it as 1-D;
    # a bare integer is not iterable and would otherwise fail below.
    if isinstance(shape, (int, np.integer)):
        shape = (shape, )

    shape = tuple(map(sanitize_index, shape))
    known_sizes = [s for s in shape if s != -1]
    if len(known_sizes) < len(shape):
        if len(shape) - len(known_sizes) > 1:
            raise ValueError("can only specify one unknown dimension")
        # Fastpath for x.reshape(-1) on 1D arrays, allows unknown shape in x
        # for this case only.
        if len(shape) == 1 and x.ndim == 1:
            return x
        # The single -1 entry is whatever is left of the total size once the
        # known dimensions are divided out.
        missing_size = sanitize_index(x.size / reduce(mul, known_sizes, 1))
        shape = tuple(missing_size if s == -1 else s for s in shape)

    if np.isnan(sum(x.shape)):
        raise ValueError("Array chunk size or shape is unknown. shape: %s\n\n"
                         "Possible solution with x.compute_chunk_sizes()" %
                         str(x.shape))

    if reduce(mul, shape, 1) != x.size:
        raise ValueError("total size of new array must be unchanged")

    if x.shape == shape:
        return x

    meta = meta_from_array(x, len(shape))

    name = "reshape-" + tokenize(x, shape)

    # A single-partition input needs no rechunking: reshape its only block
    # into one output block covering the whole new shape.
    if x.npartitions == 1:
        key = next(flatten(x.__dask_keys__()))
        dsk = {(name, ) + (0, ) * len(shape): (M.reshape, key, shape)}
        chunks = tuple((d, ) for d in shape)
        graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])
        return Array(graph, name, chunks, meta=meta)

    # Logic for how to rechunk
    din = len(x.shape)
    dout = len(shape)
    if not merge_chunks and din > dout:
        # Without chunk merging, the leading ``din - dout`` axes are
        # rechunked to a chunk size of 1.
        x = x.rechunk({i: 1 for i in range(din - dout)})

    inchunks, outchunks = reshape_rechunk(x.shape, shape, x.chunks)
    # Check output chunks are not too large
    max_chunksize_in_bytes = reduce(
        mul, [max(i) for i in outchunks]) * x.dtype.itemsize

    if limit is None:
        limit = parse_bytes(config.get("array.chunk-size"))
        # ``None`` here means the user expressed no preference, which
        # triggers the warning below instead of silently splitting.
        split = config.get("array.slicing.split-large-chunks", None)
    else:
        # An explicit limit is treated as a request to split large chunks.
        limit = parse_bytes(limit)
        split = True

    if max_chunksize_in_bytes > limit:
        if split is None:
            msg = (
                "Reshaping is producing a large chunk. To accept the large\n"
                "chunk and silence this warning, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):\n"
                "    ...     array.reshape(shape)\n\n"
                "To avoid creating the large chunks, set the option\n"
                "    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):\n"
                "    ...     array.reshape(shape)\n\n"
                "Explicitly passing ``limit`` to ``reshape`` will also silence this warning\n"
                "    >>> array.reshape(shape, limit='128 MiB')")
            warnings.warn(msg, PerformanceWarning, stacklevel=6)
        elif split:
            # Leave chunk sizes unaltered where possible
            matching_chunks = Counter(inchunks) & Counter(outchunks)
            chunk_plan = []
            for out in outchunks:
                if matching_chunks[out] > 0:
                    chunk_plan.append(out)
                    matching_chunks[out] -= 1
                else:
                    # Let ``normalize_chunks`` pick a size within ``limit``.
                    chunk_plan.append("auto")
            outchunks = normalize_chunks(
                chunk_plan,
                shape=shape,
                limit=limit,
                dtype=x.dtype,
                previous_chunks=inchunks,
            )

    x2 = x.rechunk(inchunks)

    # Construct graph: pair each output key with the input key at the same
    # position and reshape that input block to the output block's shape.
    in_keys = list(product([x2.name], *[range(len(c)) for c in inchunks]))
    out_keys = list(product([name], *[range(len(c)) for c in outchunks]))
    shapes = list(product(*outchunks))
    dsk = {
        a: (M.reshape, b, chunk_shape)
        for a, b, chunk_shape in zip(out_keys, in_keys, shapes)
    }

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x2])
    return Array(graph, name, outchunks, meta=meta)