Example #1
def test_atop_raises_on_incorrect_indices():
    x = da.arange(5, chunks=3)
    with pytest.raises(ValueError) as info:
        da.atop(lambda x: x, 'ii', x, 'ii', dtype=int)

    assert 'ii' in str(info.value)
    assert '1' in str(info.value)
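
Note: da.atop is the older name for what later dask releases call da.blockwise. The string after the function gives one index letter per output dimension, and each input array is followed by its own index string; the repeated 'ii' above fails because x is 1-dimensional, hence the test's check that '1' (the array's ndim) appears in the message. A minimal valid call, sketched for dask versions that still ship atop:

import dask.array as da

x = da.arange(5, chunks=3)
y = da.atop(lambda block: block + 1, 'i', x, 'i', dtype=int)  # one index letter per dimension
assert (y.compute() == [1, 2, 3, 4, 5]).all()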
Example #2
def test_dont_merge_before_reductions():
    x = da.ones(10, chunks=(5,))
    y = da.atop(inc, 'i', x, 'i', dtype=x.dtype)
    z = da.atop(sum, '', y, 'i', dtype=y.dtype)
    w = da.atop(sum, '', z, '', dtype=y.dtype)

    dsk = optimize_atop(w.dask)

    assert len([d for d in dsk.dicts.values() if isinstance(d, TOP)]) == 2

    z.compute()
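
This fusion test relies on dask-internal names the snippet does not import: optimize_atop and TOP are the atop-era spellings of what later releases call optimize_blockwise and Blockwise, and the assertion checks that the optimizer does not fuse atop layers across the reduction, so two TOP layers remain. inc is dask's trivial test helper:

# dask.utils_test.inc, the one-line helper used throughout dask's test suite
def inc(x):
    return x + 1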
Example #3
def test_validate_top_inputs():
    A = da.random.random((20, 20), chunks=(10, 10))

    with pytest.raises(ValueError) as info:
        da.atop(inc, 'jk', A, 'ij', dtype=A.dtype)

    assert 'unknown dimension' in str(info.value).lower()
    assert 'k' in str(info.value)
    assert 'j' not in str(info.value)

    with pytest.raises(ValueError) as info:
        da.atop(inc, 'ii', A, 'ij', dtype=A.dtype)

    assert 'repeated' in str(info.value).lower()
    assert 'i' in str(info.value)
Example #4
def __init__(self, store, base_name, chunk_info):
    self.store = store
    darray = {}
    extra_flags = []
    for array, info in chunk_info.items():
        array_name = store.join(base_name, array)
        chunk_args = (array_name, info['chunks'], info['dtype'])
        darray[array] = store.get_dask_array(*chunk_args)
        # Find all missing chunks in array and convert to 'data_lost' flags
        has_array = store.has_dask_array(*chunk_args)
        chunks_lost = da.map_blocks(_has_chunk_to_flags,
                                    has_array,
                                    token='missing-chunks-' + array_name,
                                    chunks=info['chunks'],
                                    dtype=np.uint8,
                                    full_chunks=info['chunks'])
        extra_flags.append(chunks_lost)
        # atop consumes interleaved (array, index) pairs, so append the index too
        extra_flags.append('ijk'[:chunks_lost.ndim])
    vis = darray['correlator_data']
    # Combine original L0 flags with extras (missing chunks per array)
    flags = da.atop(_multi_or_3d,
                    'ijk',
                    darray['flags'],
                    'ijk',
                    *extra_flags,
                    token=store.join(base_name, 'flags_raw'),
                    dtype=np.uint8)
    # Combine low-resolution weights and high-resolution weights_channel
    weights = darray['weights'] * darray['weights_channel'][..., np.newaxis]
    VisFlagsWeights.__init__(self, vis, flags, weights, base_name)
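
Note: atop takes a variadic, interleaved sequence of (array, index-string) pairs, which is why the loop above appends each chunks_lost array and then its index string to extra_flags. A minimal sketch of the same pattern, assuming _multi_or_3d (whose body is not shown here) simply ORs its inputs elementwise:

import numpy as np
import dask.array as da

def _or_all(first, *rest):
    # Elementwise OR of any number of same-shaped blocks
    out = first
    for other in rest:
        out = out | other
    return out

a = da.zeros((4, 3, 2), chunks=2, dtype=np.uint8)
b = da.ones((4, 3, 2), chunks=2, dtype=np.uint8)
combined = da.atop(_or_all, 'ijk', a, 'ijk', b, 'ijk', dtype=np.uint8)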
Example #5
def test_atop_legacy():
    x = da.ones(10, chunks=(5, ))
    with pytest.warns(None):
        y = da.atop(inc, "i", x, "i", dtype=x.dtype)
    z = da.blockwise(inc, "i", x, "i", dtype=x.dtype)
    assert_eq(y, z)
    assert y.name == z.name
Example #6
def test_common_token_names_kwargs(name):
    x = np.array(['a', 'bb', 'ccc'], dtype=object)
    d = da.from_array(x, chunks=2)

    result = da.atop(lambda x, y: x + y, 'i', d, 'i', y=name, dtype=object)
    expected = x + name

    assert_eq(result, expected)
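
Note: name is a pytest parameter here; the test checks that a keyword argument is forwarded to the function even when its name collides with commonly used dask keyword names. A minimal sketch of the kwargs pass-through:

import numpy as np
import dask.array as da

d = da.from_array(np.arange(3), chunks=2)
out = da.atop(lambda block, y: block + y, 'i', d, 'i', y=10, dtype=int)
assert (out.compute() == [10, 11, 12]).all()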
Example #7
def test_atop_legacy():
    x = da.ones(10, chunks=(5, ))
    with pytest.warns(UserWarning,
                      match="The da.atop function has moved to da.blockwise"):
        y = da.atop(inc, "i", x, "i", dtype=x.dtype)
    z = da.blockwise(inc, "i", x, "i", dtype=x.dtype)
    assert_eq(y, z)
    assert y.name == z.name
Example #8
def test_namedtuple(tup):
    A = da.random.random((20, 20), chunks=(10, 10))

    def f(data, x):
        return data

    B = da.atop(f, ("d1", "d2"), A, ("d1", "d2"), x=tup, dtype=A.dtype)

    assert_eq(A, B)
Example #9
def _apply_with_dask_atop(func, args, input_dims, output_dims, signature,
                          output_dtypes, output_sizes=None):
    import dask.array as da

    if signature.num_outputs > 1:
        raise NotImplementedError('multiple outputs from apply_ufunc not yet '
                                  "supported with dask='parallelized'")

    if output_dtypes is None:
        raise ValueError('output dtypes (output_dtypes) must be supplied to '
                         "apply_ufunc when using dask='parallelized'")
    if not isinstance(output_dtypes, list):
        raise TypeError('output_dtypes must be a list of objects coercible to '
                        'numpy dtypes, got {}'.format(output_dtypes))
    if len(output_dtypes) != signature.num_outputs:
        raise ValueError('apply_ufunc arguments output_dtypes and '
                         'output_core_dims must have the same length: {} vs {}'
                         .format(len(output_dtypes), signature.num_outputs))
    (dtype,) = output_dtypes

    if output_sizes is None:
        output_sizes = {}

    new_dims = signature.all_output_core_dims - signature.all_input_core_dims
    if any(dim not in output_sizes for dim in new_dims):
        raise ValueError("when using dask='parallelized' with apply_ufunc, "
                         'output core dimensions not found on inputs must '
                         'have explicitly set sizes with ``output_sizes``: {}'
                         .format(new_dims))

    for n, (data, core_dims) in enumerate(
            zip(args, signature.input_core_dims)):
        if isinstance(data, dask_array_type):
            # core dimensions cannot span multiple chunks
            for axis, dim in enumerate(core_dims, start=-len(core_dims)):
                if len(data.chunks[axis]) != 1:
                    raise ValueError(
                        'dimension {!r} on {}th function argument to '
                        "apply_ufunc with dask='parallelized' consists of "
                        'multiple chunks, but is also a core dimension. To '
                        'fix, rechunk into a single dask array chunk along '
                        'this dimension, i.e., ``.rechunk({})``, but beware '
                        'that this may significantly increase memory usage.'
                        .format(dim, n, {dim: -1}))

    (out_ind,) = output_dims

    atop_args = []
    for arg, dims in zip(args, input_dims):
        # skip leading dimensions that are implicitly added by broadcasting
        ndim = getattr(arg, 'ndim', 0)
        trimmed_dims = dims[-ndim:] if ndim else ()
        atop_args.extend([arg, trimmed_dims])

    return da.atop(func, out_ind, *atop_args, dtype=dtype, concatenate=True,
                   new_axes=output_sizes)
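
Note: the atop_args list built above is the flat, interleaved (array, dims) sequence that da.atop expects after the output indices. A sketch of the resulting call shape for two hypothetical inputs mapped over dims ('x',) and ('x', 'y'):

import dask.array as da

a = da.ones((4,), chunks=2)
b = da.ones((4, 3), chunks=(2, 3))
out = da.atop(lambda u, v: u[:, None] + v, ('x', 'y'),
              a, ('x',),
              b, ('x', 'y'),
              dtype='f8', concatenate=True)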
Example #10
def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means||',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True,
                         oversampling_factor=2,
                         init_max_iter=None):
    centers = k_init(X, n_clusters, init=init,
                     oversampling_factor=oversampling_factor,
                     random_state=random_state, max_iter=init_max_iter)
    dt = X.dtype
    X = X.astype(np.float32)
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        centers = centers.astype('f4')
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric='euclidean', metric_kwargs={"squared": True}
        )

        labels = labels.astype(np.int32)
        distances = distances.astype(np.float32)

        r = da.atop(_centers_dense, 'ij',
                    X, 'ij',
                    labels, 'i',
                    n_clusters, None,
                    distances, 'i',
                    adjust_chunks={"i": n_clusters, "j": P},
                    dtype='f8')
        new_centers = da.from_delayed(
            sum(r.to_delayed().flatten()),
            (n_clusters, P),
            X.dtype
        )
        counts = da.bincount(labels, minlength=n_clusters)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]", i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
    inertia = distances.astype(dt).sum()
    centers = centers.astype(dt)
    labels = labels.astype(np.int64)

    return labels, inertia, centers, i + 1
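
Note: the atop call above uses adjust_chunks to declare that every block's partial result has shape (n_clusters, P) rather than the input block's shape, and the per-block partials are then summed through to_delayed(). A self-contained sketch of that partial-result pattern (the arrays here are illustrative, not dask-ml's):

import numpy as np
import dask.array as da

x = da.ones((8, 4), chunks=(4, 4))
# Each block contributes a 1 x 4 partial sum; adjust_chunks records the new shape
partials = da.atop(lambda block: block.sum(axis=0, keepdims=True), 'ij', x, 'ij',
                   adjust_chunks={'i': 1}, dtype=x.dtype)
total = partials.sum(axis=0)  # reduce the partials afterwards
assert (total.compute() == 8).all()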
Example #11
def dask_relabel_chunks(A):
    """
    Relabel all the the chunks of an input array. It is assumed
    that a `map_blocks` or `map_overlap`, has been previously applied
    over `A`, and each of the chunks of `A` contains a local labelling.

    The function labels all the chunks so that all the labels all
    globally independent.

    E.g. for an input dataset with 3 different chunks of local labels:

    Chunk1: [1, 0, 0] # Maxlabel = 1
    Chunk2: [0, 2, 1] # Maxlabel = 2
    Chunk3: [2, 0, 1] # Maxlabel = 2

    The relabelling of the chunks would look like:

    newChunk1: [1, 0, 0]
    newChunk2: [2, 4, 3] # Chunk2 + Maxlabel(newChunk1) + 1
    newChunk3: [7, 5, 6] # Chunk3 + Maxlabel(newChunk2) + 1

    Parameters
    ----------
    A: dask.Array
        An input array to be relabeled

    Returns
    -------
    B: dask.Array
        Dask array of the same shape, with chunks relabelled.
    """
    inds = tuple(range(A.ndim))
    max_per_block = da.atop(np.max,
                            inds,
                            A,
                            inds,
                            axis=inds,
                            keepdims=True,
                            dtype=A.dtype,
                            adjust_chunks={i: 1
                                           for i in inds})
    block_index_global = da.cumsum(max_per_block.ravel() + 1)

    def relabel(a, block_id=None):
        bid = int(np.ravel_multi_index(block_id, A.numblocks))
        if bid == 0:
            return a
        return a + block_index_global[bid - 1]

    return A.map_blocks(relabel)
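
A quick check against the docstring's example, with three chunks of local labels:

import numpy as np
import dask.array as da

local = da.from_array(np.array([1, 0, 0, 0, 2, 1, 2, 0, 1]), chunks=3)
relabelled = dask_relabel_chunks(local)
print(relabelled.compute())  # per the docstring's scheme: [1 0 0 2 4 3 7 5 6]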
Example #12
def isin(element, test_elements, assume_unique=False, invert=False):
    element = da.asarray(element)
    test_elements = da.asarray(test_elements)
    element_axes = tuple(range(element.ndim))
    test_axes = tuple(i + element.ndim for i in range(test_elements.ndim))
    mapped = da.atop(_isin_kernel, element_axes + test_axes,
                     element, element_axes,
                     test_elements, test_axes,
                     adjust_chunks={axis: lambda _: 1
                                    for axis in test_axes},
                     dtype=bool,
                     assume_unique=assume_unique)
    result = mapped.any(axis=test_axes)
    if invert:
        result = ~result
    return result
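
Usage sketch, assuming the private _isin_kernel from dask's source is in scope (it compares each element block against the whole test set):

import numpy as np
import dask.array as da

element = da.from_array(np.array([0, 1, 2, 5, 0]), chunks=2)
mask = isin(element, np.array([0, 2]))
print(mask.compute())  # [ True False  True False  True]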
Example #13
    def inverse_transform(self, X):
        """Inverse ordinal-encode the columns in `X`

        Parameters
        ----------
        X : array or dataframe
            Either the NumPy, dask, or pandas version

        Returns
        -------
        data : DataFrame
            A Dask array or dataframe input returns a Dask DataFrame;
            a NumPy array or pandas dataframe input returns a pandas DataFrame.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.columns_)

        elif isinstance(X, da.Array):
            # later on we concat(..., axis=1), which requires
            # known divisions. Suboptimal, but I think unavoidable.
            unknown = np.isnan(X.chunks[0]).any()
            if unknown:
                lengths = da.atop(len, "i", X[:, 0], "i", dtype="i8").compute()
                X = X.copy()
                chunks = (tuple(lengths), X.chunks[1])
                X._chunks = chunks

            X = dd.from_dask_array(X, columns=self.columns_)

        big = isinstance(X, dd.DataFrame)

        if big:
            chunks = np.array(X.divisions)
            chunks[-1] = chunks[-1] + 1
            chunks = tuple(chunks[1:] - chunks[:-1])

        X = X.copy()
        for col in self.categorical_columns_:
            if _HAS_CTD:
                dtype = self.dtypes_[col]
                categories, ordered = dtype.categories, dtype.ordered
            else:
                categories, ordered = self.dtypes_[col]

            # use .values to avoid warning from pandas
            codes = X[col].values

            if big:
                # dask
                codes._chunks = (chunks,)
                # Need a Categorical.from_codes for dask
                series = (
                    dd.from_dask_array(codes, columns=col)
                    .astype("category")
                    .cat.set_categories(np.arange(len(categories)), ordered=ordered)
                    .cat.rename_categories(categories)
                )
                # Bug in pandas <= 0.20.3 lost name
                if series.name is None:
                    series.name = col
                series.divisions = X.divisions
            else:
                # pandas
                series = pd.Series(
                    pd.Categorical.from_codes(codes, categories, ordered=ordered),
                    name=col,
                )

            X[col] = series

        return X
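
Note: the da.atop(len, 'i', ...) trick above materialises one length per chunk in order to repair unknown (NaN) chunk sizes. Newer dask (roughly 2.4 onward) has a built-in for the same job, sketched here:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'a': range(10)}), npartitions=3)
x = ddf.to_dask_array()       # chunk sizes along axis 0 are unknown (nan)
x = x.compute_chunk_sizes()   # computes the lengths and fixes the chunks
print(x.chunks)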
Example #14
def _kmeans_single_lloyd(
    X,
    n_clusters,
    max_iter=300,
    init="k-means||",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
    oversampling_factor=2,
    init_max_iter=None,
):
    centers = k_init(
        X,
        n_clusters,
        init=init,
        oversampling_factor=oversampling_factor,
        random_state=random_state,
        max_iter=init_max_iter,
    )
    dt = X.dtype
    P = X.shape[1]
    for i in range(max_iter):
        with _timer("Lloyd loop %2d." % i, _logger=logger):
            labels, distances = pairwise_distances_argmin_min(
                X,
                centers,
                metric="euclidean",
                metric_kwargs={"squared": True})

            labels = labels.astype(np.int32)
            # distances is always float64; cast a copy to X.dtype for
            # _centers_dense, while the float64 original still feeds inertia
            r = da.atop(
                _centers_dense,
                "ij",
                X,
                "ij",
                labels,
                "i",
                n_clusters,
                None,
                distances.astype(X.dtype),
                "i",
                adjust_chunks={
                    "i": n_clusters,
                    "j": P
                },
                dtype=X.dtype,
            )
            new_centers = da.from_delayed(sum(r.to_delayed().flatten()),
                                          (n_clusters, P), X.dtype)
            counts = da.bincount(labels, minlength=n_clusters)
            # Require at least one per bucket, to avoid division by 0.
            counts = da.maximum(counts, 1)
            new_centers = new_centers / counts[:, None]
            new_centers, = compute(new_centers)

            # Convergence check
            shift = squared_norm(centers - new_centers)

            logger.info("Shift: %0.4f", shift)
            if shift < tol:
                break
            centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
        labels = labels.astype(np.int32)

    inertia = distances.sum()
    centers = centers.astype(dt)

    return labels, inertia, centers, i + 1
Example #15
    def inverse_transform(self, X):
        """Inverse dummy-encode the columns in `X`

        Parameters
        ----------
        X : array or dataframe
            Either the NumPy, dask, or pandas version

        Returns
        -------
        data : DataFrame
            A Dask array or dataframe input returns a Dask DataFrame;
            a NumPy array or pandas dataframe input returns a pandas DataFrame.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.transformed_columns_)

        elif isinstance(X, da.Array):
            # later on we concat(..., axis=1), which requires
            # known divisions. Suboptimal, but I think unavoidable.
            unknown = np.isnan(X.chunks[0]).any()
            if unknown:
                lengths = da.atop(len, 'i', X[:, 0], 'i', dtype='i8').compute()
                X = X.copy()
                chunks = (tuple(lengths), X.chunks[1])
                X._chunks = chunks

            X = dd.from_dask_array(X, columns=self.transformed_columns_)

        big = isinstance(X, dd.DataFrame)

        if big:
            chunks = np.array(X.divisions)
            chunks[-1] = chunks[-1] + 1
            chunks = tuple(chunks[1:] - chunks[:-1])

        non_cat = X[list(self.non_categorical_columns_)]

        cats = []
        for col in self.categorical_columns_:
            slice_ = self.categorical_blocks_[col]
            if _HAS_CTD:
                dtype = self.dtypes_[col]
                categories, ordered = dtype.categories, dtype.ordered
            else:
                categories, ordered = self.dtypes_[col]

            # use .values to avoid warning from pandas
            inds = X[list(X.columns[slice_])].values
            codes = inds.argmax(1)

            if self.drop_first:
                codes += 1
                codes[(inds == 0).all(1)] = 0

            if big:
                # dask
                codes._chunks = (chunks, )
                # Need a Categorical.from_codes for dask
                series = (dd.from_dask_array(
                    codes, columns=col).astype('category').cat.set_categories(
                        np.arange(len(categories)),
                        ordered=ordered).cat.rename_categories(categories))
                # Bug in pandas <= 0.20.3 lost name
                if series.name is None:
                    series.name = col
                series.divisions = X.divisions
            else:
                # pandas
                series = pd.Series(pd.Categorical.from_codes(codes,
                                                             categories,
                                                             ordered=ordered),
                                   name=col)

            cats.append(series)
        if big:
            df = dd.concat([non_cat] + cats, axis=1)[list(self.columns_)]
        else:
            df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
        return df
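
Note: the argmax call above inverts one-hot columns back to category codes, and the drop_first branch handles the dropped first category, which shows up as an all-zero row. A toy check in plain NumPy:

import numpy as np

inds = np.array([[1, 0, 0],
                 [0, 0, 1],
                 [0, 1, 0]])
codes = inds.argmax(1)  # -> [0, 2, 1]

# With drop_first, columns encode categories 1..k and an all-zero row is category 0:
inds_df = np.array([[0, 0],
                    [1, 0],
                    [0, 1]])
codes_df = inds_df.argmax(1) + 1
codes_df[(inds_df == 0).all(1)] = 0  # -> [0, 1, 2]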