def test_atop_raises_on_incorrect_indices():
    x = da.arange(5, chunks=3)

    with pytest.raises(ValueError) as info:
        da.atop(lambda x: x, 'ii', x, 'ii', dtype=int)

    assert 'ii' in str(info.value)
    assert '1' in str(info.value)

def test_dont_merge_before_reductions():
    x = da.ones(10, chunks=(5,))
    y = da.atop(inc, 'i', x, 'i', dtype=x.dtype)
    z = da.atop(sum, '', y, 'i', dtype=y.dtype)
    w = da.atop(sum, '', z, '', dtype=y.dtype)

    dsk = optimize_atop(w.dask)

    assert len([d for d in dsk.dicts.values() if isinstance(d, TOP)]) == 2

    z.compute()

def test_validate_top_inputs():
    A = da.random.random((20, 20), chunks=(10, 10))

    with pytest.raises(ValueError) as info:
        da.atop(inc, 'jk', A, 'ij', dtype=A.dtype)

    assert 'unknown dimension' in str(info.value).lower()
    assert 'k' in str(info.value)
    assert 'j' not in str(info.value)

    with pytest.raises(ValueError) as info:
        da.atop(inc, 'ii', A, 'ij', dtype=A.dtype)

    assert 'repeated' in str(info.value).lower()
    assert 'i' in str(info.value)

def __init__(self, store, base_name, chunk_info):
    self.store = store
    darray = {}
    extra_flags = []
    for array, info in chunk_info.items():
        array_name = store.join(base_name, array)
        chunk_args = (array_name, info['chunks'], info['dtype'])
        darray[array] = store.get_dask_array(*chunk_args)
        # Find all missing chunks in array and convert to 'data_lost' flags
        has_array = store.has_dask_array(*chunk_args)
        chunks_lost = da.map_blocks(_has_chunk_to_flags, has_array,
                                    token='missing-chunks-' + array_name,
                                    chunks=info['chunks'], dtype=np.uint8,
                                    full_chunks=info['chunks'])
        extra_flags.append(chunks_lost)
        extra_flags.append('ijk'[:chunks_lost.ndim])
    vis = darray['correlator_data']
    # Combine original L0 flags with extras (missing chunks per array)
    flags = da.atop(_multi_or_3d, 'ijk', darray['flags'], 'ijk', *extra_flags,
                    token=store.join(base_name, 'flags_raw'), dtype=np.uint8)
    # Combine low-resolution weights and high-resolution weights_channel
    weights = darray['weights'] * darray['weights_channel'][..., np.newaxis]
    VisFlagsWeights.__init__(self, vis, flags, weights, base_name)

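# The helpers _has_chunk_to_flags and _multi_or_3d are referenced above but
# not shown. Below is a hypothetical reconstruction of _multi_or_3d, assuming
# it simply ORs together any number of broadcastable uint8 flag blocks (a
# minimal sketch, not the original implementation):
import numpy as np

def _multi_or_3d(*arrays):
    """Bitwise-OR an arbitrary number of broadcastable flag blocks."""
    out = np.zeros(np.broadcast(*arrays).shape, dtype=np.uint8)
    for a in arrays:
        out |= a
    return out
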
def test_atop_legacy():
    x = da.ones(10, chunks=(5,))
    with pytest.warns(None):
        y = da.atop(inc, "i", x, "i", dtype=x.dtype)
    z = da.blockwise(inc, "i", x, "i", dtype=x.dtype)
    assert_eq(y, z)
    assert y.name == z.name

def test_common_token_names_kwargs(name):
    x = np.array(['a', 'bb', 'ccc'], dtype=object)
    d = da.from_array(x, chunks=2)
    result = da.atop(lambda x, y: x + y, 'i', d, 'i', y=name, dtype=object)
    expected = x + name
    assert_eq(result, expected)

def test_atop_legacy():
    x = da.ones(10, chunks=(5,))
    with pytest.warns(UserWarning,
                      match="The da.atop function has moved to da.blockwise"):
        y = da.atop(inc, "i", x, "i", dtype=x.dtype)
    z = da.blockwise(inc, "i", x, "i", dtype=x.dtype)
    assert_eq(y, z)
    assert y.name == z.name

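# The test_atop_legacy variants above track the da.atop -> da.blockwise
# rename: newer dask emits a UserWarning from da.atop, and both spellings
# build identical graphs (same .name). A minimal standalone sketch of the
# preferred spelling, assuming a dask version that provides da.blockwise:
import dask.array as da

def inc(block):
    return block + 1

x = da.ones(10, chunks=(5,))
y = da.blockwise(inc, 'i', x, 'i', dtype=x.dtype)  # preferred over da.atop
assert (y == 2).all().compute()
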
def test_namedtuple(tup):
    A = da.random.random((20, 20), chunks=(10, 10))

    def f(data, x):
        return data

    B = da.atop(f, ("d1", "d2"), A, ("d1", "d2"), x=tup, dtype=A.dtype)
    assert_eq(A, B)

def _apply_with_dask_atop(func, args, input_dims, output_dims,
                          signature, output_dtypes, output_sizes=None):
    import dask.array as da

    if signature.num_outputs > 1:
        raise NotImplementedError('multiple outputs from apply_ufunc not yet '
                                  "supported with dask='parallelized'")

    if output_dtypes is None:
        raise ValueError('output dtypes (output_dtypes) must be supplied to '
                         "apply_ufunc when using dask='parallelized'")

    if not isinstance(output_dtypes, list):
        raise TypeError('output_dtypes must be a list of objects coercible to '
                        'numpy dtypes, got {}'.format(output_dtypes))

    if len(output_dtypes) != signature.num_outputs:
        raise ValueError('apply_ufunc arguments output_dtypes and '
                         'output_core_dims must have the same length: {} vs {}'
                         .format(len(output_dtypes), signature.num_outputs))
    (dtype,) = output_dtypes

    if output_sizes is None:
        output_sizes = {}

    new_dims = signature.all_output_core_dims - signature.all_input_core_dims
    if any(dim not in output_sizes for dim in new_dims):
        raise ValueError("when using dask='parallelized' with apply_ufunc, "
                         'output core dimensions not found on inputs must '
                         'have explicitly set sizes with ``output_sizes``: {}'
                         .format(new_dims))

    for n, (data, core_dims) in enumerate(
            zip(args, signature.input_core_dims)):
        if isinstance(data, dask_array_type):
            # core dimensions cannot span multiple chunks
            for axis, dim in enumerate(core_dims, start=-len(core_dims)):
                if len(data.chunks[axis]) != 1:
                    raise ValueError(
                        'dimension {!r} on {}th function argument to '
                        "apply_ufunc with dask='parallelized' consists of "
                        'multiple chunks, but is also a core dimension. To '
                        'fix, rechunk into a single dask array chunk along '
                        'this dimension, i.e., ``.rechunk({})``, but beware '
                        'that this may significantly increase memory usage.'
                        .format(dim, n, {dim: -1}))

    (out_ind,) = output_dims
    atop_args = []
    for arg, dims in zip(args, input_dims):
        # skip leading dimensions that are implicitly added by broadcasting
        ndim = getattr(arg, 'ndim', 0)
        trimmed_dims = dims[-ndim:] if ndim else ()
        atop_args.extend([arg, trimmed_dims])
    return da.atop(func, out_ind, *atop_args, dtype=dtype, concatenate=True,
                   new_axes=output_sizes)

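# The loop above rejects any core dimension that spans multiple chunks. A
# minimal sketch of the fix its error message suggests, using a hypothetical
# 2-d dask array whose last axis plays the role of the core dimension:
import dask.array as da

data = da.ones((4, 100), chunks=(2, 25))  # core dim split across 4 chunks
data = data.rechunk({1: -1})              # collapse it into a single chunk
assert len(data.chunks[1]) == 1
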
def _kmeans_single_lloyd(X, n_clusters, max_iter=300, init='k-means||',
                         verbose=False, x_squared_norms=None,
                         random_state=None, tol=1e-4,
                         precompute_distances=True,
                         oversampling_factor=2, init_max_iter=None):
    centers = k_init(X, n_clusters, init=init,
                     oversampling_factor=oversampling_factor,
                     random_state=random_state, max_iter=init_max_iter)
    dt = X.dtype
    X = X.astype(np.float32)
    P = X.shape[1]
    for i in range(max_iter):
        t0 = tic()
        centers = centers.astype('f4')
        labels, distances = pairwise_distances_argmin_min(
            X, centers, metric='euclidean', metric_kwargs={"squared": True}
        )

        labels = labels.astype(np.int32)
        distances = distances.astype(np.float32)

        r = da.atop(_centers_dense, 'ij',
                    X, 'ij',
                    labels, 'i',
                    n_clusters, None,
                    distances, 'i',
                    adjust_chunks={"i": n_clusters, "j": P},
                    dtype='f8')
        new_centers = da.from_delayed(
            sum(r.to_delayed().flatten()),
            (n_clusters, P), X.dtype
        )
        counts = da.bincount(labels, minlength=n_clusters)
        new_centers = new_centers / counts[:, None]
        new_centers, = compute(new_centers)

        # Convergence check
        shift = squared_norm(centers - new_centers)
        t1 = tic()
        logger.info("Lloyd loop %2d. Shift: %0.4f [%.2f s]",
                    i, shift, t1 - t0)
        if shift < tol:
            break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)

    inertia = distances.astype(dt).sum()
    centers = centers.astype(dt)
    labels = labels.astype(np.int64)
    return labels, inertia, centers, i + 1

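# adjust_chunks, as used in the _centers_dense call above, tells atop that an
# output block's size differs from the matching input block's size (here each
# block contributes a full (n_clusters, P) partial result, summed afterwards).
# A toy sketch of the mechanism, assuming a dask version that still provides
# da.atop:
import dask.array as da

x = da.ones(10, chunks=5)
# Each size-5 input block maps to a size-2 output block.
y = da.atop(lambda b: b[:2], 'i', x, 'i', dtype=x.dtype,
            adjust_chunks={'i': 2})
assert y.chunks == ((2, 2),)
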
def dask_relabel_chunks(A):
    """Relabel all the chunks of an input array.

    It is assumed that a `map_blocks` or `map_overlap` has previously been
    applied over `A`, so that each chunk of `A` contains a local labelling.
    The function relabels the chunks so that the labels are globally unique.

    E.g. for an input dataset with 3 different chunks of local labels:

        Chunk1: [1, 0, 0]  # Maxlabel = 1
        Chunk2: [0, 2, 1]  # Maxlabel = 2
        Chunk3: [2, 0, 1]  # Maxlabel = 2

    The relabelling of the chunks would look like:

        newChunk1: [1, 0, 0]
        newChunk2: [2, 4, 3]  # Chunk2 + Maxlabel(newChunk1) + 1
        newChunk3: [7, 5, 6]  # Chunk3 + Maxlabel(newChunk2) + 1

    Parameters
    ----------
    A : dask.Array
        An input array to be relabelled.

    Returns
    -------
    B : dask.Array
        Dask array of the same shape, with chunks relabelled.
    """
    inds = tuple(range(A.ndim))
    # Per-block maximum label, kept as a one-element block per input block
    max_per_block = da.atop(np.max, inds, A, inds, axis=inds, keepdims=True,
                            dtype=A.dtype, adjust_chunks={i: 1 for i in inds})
    # Running offset that each block must be shifted by
    block_index_global = da.cumsum(max_per_block.ravel() + 1)

    def relabel(a, block_id=None):
        bid = int(np.ravel_multi_index(block_id, A.numblocks))
        if bid == 0:
            return a
        return a + block_index_global[bid - 1]

    return A.map_blocks(relabel)

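# A minimal usage sketch for dask_relabel_chunks, mirroring the docstring
# example above: three chunks carry local labels, and the relabelling shifts
# each chunk past the running maximum of the preceding chunks.
import numpy as np
import dask.array as da

labels = da.from_array(np.array([1, 0, 0, 0, 2, 1, 2, 0, 1]), chunks=3)
relabelled = dask_relabel_chunks(labels)
# Expected result, per the docstring: [1 0 0 2 4 3 7 5 6]
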
def isin(element, test_elements, assume_unique=False, invert=False):
    element = da.asarray(element)
    test_elements = da.asarray(test_elements)
    element_axes = tuple(range(element.ndim))
    test_axes = tuple(i + element.ndim for i in range(test_elements.ndim))
    mapped = da.atop(_isin_kernel, element_axes + test_axes,
                     element, element_axes,
                     test_elements, test_axes,
                     adjust_chunks={axis: lambda _: 1 for axis in test_axes},
                     dtype=bool,
                     assume_unique=assume_unique)
    result = mapped.any(axis=test_axes)
    if invert:
        result = ~result
    return result

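# A usage sketch for the blockwise isin above. _isin_kernel is not shown in
# this snippet; the version below is a plausible kernel consistent with the
# atop call (each (element-block, test-block) pair reduces to a size-1 cell
# along the test axes, then .any() collapses them), assuming a dask version
# that still provides da.atop:
import numpy as np
import dask.array as da

def _isin_kernel(element, test_elements, assume_unique=False):
    values = np.in1d(element.ravel(), test_elements,
                     assume_unique=assume_unique)
    return values.reshape(element.shape + (1,) * test_elements.ndim)

element = da.from_array(np.arange(6), chunks=3)
test = da.from_array(np.array([0, 2, 4]), chunks=2)
print(isin(element, test).compute())  # [ True False  True False  True False]
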
def inverse_transform(self, X):
    """Inverse ordinal-encode the columns in `X`

    Parameters
    ----------
    X : array or dataframe
        Either the NumPy, dask, or pandas version

    Returns
    -------
    data : DataFrame
        Dask array or dataframe will return a Dask DataFrame.
        Numpy array or pandas dataframe will return a pandas DataFrame
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=self.columns_)
    elif isinstance(X, da.Array):
        # later on we concat(..., axis=1), which requires
        # known divisions. Suboptimal, but I think unavoidable.
        unknown = np.isnan(X.chunks[0]).any()
        if unknown:
            lengths = da.atop(len, "i", X[:, 0], "i", dtype="i8").compute()
            X = X.copy()
            chunks = (tuple(lengths), X.chunks[1])
            X._chunks = chunks

        X = dd.from_dask_array(X, columns=self.columns_)

    big = isinstance(X, dd.DataFrame)

    if big:
        chunks = np.array(X.divisions)
        chunks[-1] = chunks[-1] + 1
        chunks = tuple(chunks[1:] - chunks[:-1])

    X = X.copy()
    for col in self.categorical_columns_:
        if _HAS_CTD:
            dtype = self.dtypes_[col]
            categories, ordered = dtype.categories, dtype.ordered
        else:
            categories, ordered = self.dtypes_[col]

        # use .values to avoid warning from pandas
        codes = X[col].values

        if big:
            # dask
            codes._chunks = (chunks,)
            # Need a Categorical.from_codes for dask
            series = (
                dd.from_dask_array(codes, columns=col)
                .astype("category")
                .cat.set_categories(np.arange(len(categories)), ordered=ordered)
                .cat.rename_categories(categories)
            )
            # Bug in pandas <= 0.20.3 lost name
            if series.name is None:
                series.name = col
            series.divisions = X.divisions
        else:
            # pandas
            series = pd.Series(
                pd.Categorical.from_codes(codes, categories, ordered=ordered),
                name=col,
            )

        X[col] = series

    return X

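# The unknown-chunks workaround above materializes each block's length with a
# trivial atop call, so the array can be handed to dd.from_dask_array with
# known divisions. A minimal sketch of the same trick on a toy array,
# assuming a dask version that still provides da.atop:
import numpy as np
import dask.array as da

X = da.ones((10, 3), chunks=(4, 3))
lengths = da.atop(len, "i", X[:, 0], "i", dtype="i8").compute()
# per-block lengths along axis 0, here (4, 4, 2); tuple(lengths) could then
# be patched back into X._chunks when the original sizes were NaN.
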
def _kmeans_single_lloyd(
    X,
    n_clusters,
    max_iter=300,
    init="k-means||",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
    oversampling_factor=2,
    init_max_iter=None,
):
    centers = k_init(
        X,
        n_clusters,
        init=init,
        oversampling_factor=oversampling_factor,
        random_state=random_state,
        max_iter=init_max_iter,
    )
    dt = X.dtype
    P = X.shape[1]
    for i in range(max_iter):
        with _timer("Lloyd loop %2d." % i, _logger=logger):
            labels, distances = pairwise_distances_argmin_min(
                X, centers, metric="euclidean", metric_kwargs={"squared": True}
            )

            labels = labels.astype(np.int32)
            # distances is always float64, but we need it to match X.dtype
            # for centers_dense, but remain float64 for inertia
            r = da.atop(
                _centers_dense,
                "ij",
                X,
                "ij",
                labels,
                "i",
                n_clusters,
                None,
                distances.astype(X.dtype),
                "i",
                adjust_chunks={"i": n_clusters, "j": P},
                dtype=X.dtype,
            )
            new_centers = da.from_delayed(
                sum(r.to_delayed().flatten()), (n_clusters, P), X.dtype
            )
            counts = da.bincount(labels, minlength=n_clusters)
            # Require at least one per bucket, to avoid division by 0.
            counts = da.maximum(counts, 1)
            new_centers = new_centers / counts[:, None]
            new_centers, = compute(new_centers)

            # Convergence check
            shift = squared_norm(centers - new_centers)
            logger.info("Shift: %0.4f", shift)
            if shift < tol:
                break
        centers = new_centers

    if shift > 1e-7:
        labels, distances = pairwise_distances_argmin_min(X, centers)
        labels = labels.astype(np.int32)

    inertia = distances.sum()
    centers = centers.astype(dt)

    return labels, inertia, centers, i + 1

def inverse_transform(self, X):
    """Inverse dummy-encode the columns in `X`

    Parameters
    ----------
    X : array or dataframe
        Either the NumPy, dask, or pandas version

    Returns
    -------
    data : DataFrame
        Dask array or dataframe will return a Dask DataFrame.
        Numpy array or pandas dataframe will return a pandas DataFrame
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=self.transformed_columns_)
    elif isinstance(X, da.Array):
        # later on we concat(..., axis=1), which requires
        # known divisions. Suboptimal, but I think unavoidable.
        unknown = np.isnan(X.chunks[0]).any()
        if unknown:
            lengths = da.atop(len, 'i', X[:, 0], 'i', dtype='i8').compute()
            X = X.copy()
            chunks = (tuple(lengths), X.chunks[1])
            X._chunks = chunks

        X = dd.from_dask_array(X, columns=self.transformed_columns_)

    big = isinstance(X, dd.DataFrame)

    if big:
        chunks = np.array(X.divisions)
        chunks[-1] = chunks[-1] + 1
        chunks = tuple(chunks[1:] - chunks[:-1])

    non_cat = X[list(self.non_categorical_columns_)]

    cats = []
    for col in self.categorical_columns_:
        slice_ = self.categorical_blocks_[col]
        if _HAS_CTD:
            dtype = self.dtypes_[col]
            categories, ordered = dtype.categories, dtype.ordered
        else:
            categories, ordered = self.dtypes_[col]

        # use .values to avoid warning from pandas
        inds = X[list(X.columns[slice_])].values
        codes = inds.argmax(1)

        if self.drop_first:
            codes += 1
            codes[(inds == 0).all(1)] = 0

        if big:
            # dask
            codes._chunks = (chunks,)
            # Need a Categorical.from_codes for dask
            series = (dd.from_dask_array(codes, columns=col)
                      .astype('category')
                      .cat.set_categories(np.arange(len(categories)),
                                          ordered=ordered)
                      .cat.rename_categories(categories))
            # Bug in pandas <= 0.20.3 lost name
            if series.name is None:
                series.name = col
            series.divisions = X.divisions
        else:
            # pandas
            series = pd.Series(pd.Categorical.from_codes(codes, categories,
                                                         ordered=ordered),
                               name=col)
        cats.append(series)

    if big:
        df = dd.concat([non_cat] + cats, axis=1)[list(self.columns_)]
    else:
        df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
    return df