def dask_to_tfrecords(df, folder, compression_type="GZIP", compression_level=9):
    """Write each partition of a Dask dataframe to its own TFRecord file.

    The output folder is created if necessary, a metadata file is written
    through ``write_meta``, and one ``pandas_df_to_tfrecords`` task is
    scheduled per partition.  The graph is computed before returning.

    Parameters
    ----------
    df : dask dataframe
        Collection whose partitions are written out.
    folder : path
        Directory that receives the part files and the metadata file.
    compression_type : str, optional
        Forwarded to ``get_compression_ext``, ``write_meta`` and the
        per-partition writer.
    compression_level : int, optional
        Forwarded to the per-partition writer.

    Returns
    -------
    The computed value of the final barrier task, which discards the
    per-partition results and evaluates to ``None``.
    """
    makedirs(folder, exist_ok=True)

    extension = get_compression_ext(compression_type)
    part_files = [
        get_part_filename(index, extension) for index in range(df.npartitions)
    ]

    # The metadata file lives alongside the part files.
    write_meta(df, folder, compression_type)

    task_name = "to-tfrecord-" + tokenize(df, folder)
    writer_kwargs = {}
    tasks = {}
    for index, part_file in enumerate(part_files):
        writer_args = [
            (df._name, index),
            os.path.join(folder, part_file),
            compression_type,
            compression_level,
        ]
        tasks[(task_name, index)] = (
            apply,
            pandas_df_to_tfrecords,
            writer_args,
            writer_kwargs,
        )

    # Barrier task: depends on every partition write and evaluates to None.
    part_keys = list(tasks)
    tasks[task_name] = (lambda x: None, part_keys)

    graph = HighLevelGraph.from_collections(task_name, tasks, dependencies=[df])
    return Delayed(task_name, graph).compute()
def test_persist_delayed_custom_key(key):
    """Persisting a ``Delayed`` keeps its custom key and stores the result."""
    graph = {key: "b", "b": 1}
    delayed = Delayed(key, graph)
    assert delayed.compute() == 1

    persisted = delayed.persist()
    assert persisted.compute() == 1
    # The key survives persisting, and the graph collapses to the final value.
    assert persisted.key == key
    assert dict(persisted.dask) == {key: 1}
def test_persist_delayed_rename(key, rename, new_key):
    """Rebuilding through ``__dask_postpersist__`` with ``rename`` uses the new key."""
    original = Delayed(key, {key: 1})
    assert original.compute() == 1

    rebuild, rebuild_args = original.__dask_postpersist__()
    rebuilt = rebuild({new_key: 2}, *rebuild_args, rename=rename)

    assert rebuilt.compute() == 2
    assert rebuilt.key == new_key
    assert dict(rebuilt.dask) == {new_key: 2}
def comp(dag, blocker_list):
    """Compute ``dag`` against the graph assembled from ``blocker_list``.

    ``blocker_list`` is converted with ``convert_ldicts_to_sdict`` and its
    last node is looked up with ``get_lastnode``.  When that node differs
    from ``dag.key`` a new ``Delayed`` is built for it; otherwise a deep copy
    of ``dag`` has its ``dask`` attribute replaced by the converted graph.
    The caller's ``dag`` object is never modified.

    Returns
    -------
    The computed value of the selected ``Delayed``.
    """
    from copy import deepcopy

    working = deepcopy(dag)
    graph = convert_ldicts_to_sdict(blocker_list)
    final_key = get_lastnode(dict(graph))

    if final_key != working.key:
        # Different output node: start from a fresh Delayed on the new graph.
        return Delayed(final_key, graph).compute()

    # Same output node: keep the copy and swap in the new graph.
    working.dask = graph
    return working.compute()
def fit(model, x, y, compute=True, **kwargs):
    """Train a scikit-learn model on dask arrays, one block at a time.

    The model must support the ``partial_fit`` interface for online or batch
    learning.  A chain of ``_partial_fit`` tasks is built so that the row
    blocks of ``x`` (with the matching blocks of ``y``) are visited
    sequentially, in block order.  Ideally your rows are independent and
    identically distributed.

    Parameters
    ----------
    model : sklearn model
        Any model supporting the ``partial_fit`` interface.
    x : dask Array
        Two dimensional array, likely tall and skinny.  If it has more than
        one chunk along the second axis it is rechunked to a single one.
    y : dask Array or None
        One dimensional array with the same chunks as x's rows.
    compute : bool
        If true (default), compute the graph and return the result;
        otherwise return the ``Delayed`` object.
    kwargs :
        Options to pass to partial_fit.

    Returns
    -------
    The computed value of the last task in the chain when ``compute`` is
    true, otherwise the ``Delayed`` wrapping that task.

    Raises
    ------
    AssertionError
        If ``x`` is not 2-D, ``y`` is not 1-D, the row chunks of ``x`` and
        ``y`` differ, or ``model`` has no ``partial_fit`` attribute.

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can
    use the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    # Validation order is significant: the first failing check is the one
    # that surfaces to the caller.
    assert x.ndim == 2
    if y is not None:
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]
    assert hasattr(model, 'partial_fit')

    # Collapse the column chunks so that every row block is the single
    # key (x.name, i, 0).
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)

    # Key -1 seeds the chain with the untrained model; step i consumes the
    # output of step i - 1.
    tasks = {(name, -1): model}
    y_name = getattr(y, 'name', '')
    for block in range(nblocks):
        tasks[(name, block)] = (
            _partial_fit,
            (name, block - 1),
            (x.name, block, 0),
            (y_name, block),
            kwargs,
        )

    merged = dask.sharedict.merge((name, tasks), x.dask, getattr(y, 'dask', {}))
    value = Delayed((name, nblocks - 1), merged)

    if not compute:
        return value
    return value.compute()
def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None, **kwargs):
    """Train a scikit-learn model on dask arrays, one block at a time.

    The model must support the ``partial_fit`` interface for online or batch
    learning.  Ideally your rows are independent and identically
    distributed.  By default the blocks are visited in a random order.

    Inputs that have no ``chunks`` attribute but do provide
    ``to_dask_array`` are converted with that method first.

    Parameters
    ----------
    model : sklearn model
        Any model supporting the ``partial_fit`` interface.
    x : dask Array
        Two dimensional array, likely tall and skinny.  If it has more than
        one chunk along the second axis it is rechunked to a single one.
    y : dask Array or None
        One dimensional array with the same chunks as x's rows.
    compute : bool
        Whether to compute this result.
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not.
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks.
    kwargs :
        Options to pass to partial_fit.

    Returns
    -------
    The computed value of the last task in the chain when ``compute`` is
    true, otherwise the ``Delayed`` wrapping that task.

    Raises
    ------
    AssertionError
        If ``x`` is not 2-D, ``y`` is not 1-D, the row chunks of ``x`` and
        ``y`` differ, or ``model`` has no ``partial_fit`` attribute.

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can
    use the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    # Validation order is significant: the first failing check is the one
    # that surfaces to the caller.
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2

    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]

    assert hasattr(model, "partial_fit")

    # Collapse the column chunks so that every row block is the single
    # key (x.name, i, 0).
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        sklearn.utils.check_random_state(random_state).shuffle(order)

    # The visiting order is part of the token, so different shuffles give
    # differently named graphs.
    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)

    # Key -1 seeds the chain with the untrained model; step i consumes the
    # output of step i - 1 and the block selected by the (shuffled) order.
    tasks = {(name, -1): model}
    y_name = getattr(y, "name", "")
    for step, block in enumerate(order):
        tasks[(name, step)] = (
            _partial_fit,
            (name, step - 1),
            (x.name, block, 0),
            (y_name, block),
            kwargs,
        )

    graphs = {x.name: x.__dask_graph__(), name: tasks}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    # Fall back to sharedict when HighLevelGraph cannot be imported.
    try:
        from dask.highlevelgraph import HighLevelGraph

        merged = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        merged = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), merged)

    if not compute:
        return value
    return value.compute()
def execute_plan(self, plan: Delayed, **kwargs):
    """Run ``plan`` and hand back its computed result.

    Parameters
    ----------
    plan : Delayed
        The object whose ``compute`` method is invoked.
    kwargs :
        Passed through unchanged to ``plan.compute``.

    Returns
    -------
    Whatever ``plan.compute(**kwargs)`` returns.
    """
    result = plan.compute(**kwargs)
    return result
def store_inplace(sources, targets, safe=True, **kwargs):
    """Evaluate a dask computation and store results in the original numpy arrays.

    Dask is designed to operate on immutable data: the key for a node in the
    graph is intended to uniquely identify the value. It's possible to create
    tasks that modify the backing storage, but it can potentially create race
    conditions where a value might be replaced either before or after it is
    used. This function provides safety checks that will raise an exception if
    there is a risk of this happening.

    Despite the safety checks, it still requires some user care to be used
    safely:

    - The arrays in `targets` must be backed by numpy arrays, with no
      computations other than slicing. Thus, the dask functions
      :func:`~dask.array.asarray`, :func:`~dask.array.from_array`,
      :func:`~dask.array.concatenate` and :func:`~dask.array.stack` are safe.
    - The target keys must be backed by *distinct* numpy arrays. This is not
      currently checked (although duplicate keys will be detected).
    - When creating a target array with :func:`~dask.array.from_array`,
      ensure that the array has a unique name (e.g., by passing
      ``name=False``).
    - The safety check only applies to the sources and targets passed to this
      function. Any simultaneous use of objects based on the targets is
      invalid, and afterwards any dask objects based on the targets will be
      computed with the overwritten values.

    The safety check is conservative i.e., there may be cases where it will
    throw an exception even though the operation can be proven to be safe.

    Each source is rechunked to match the chunks of the target. In cases where
    the target is backed by a single large numpy array, it may be more
    efficient to construct a new dask wrapper of that numpy array whose
    chunking matches the source.

    Parameters
    ----------
    sources : iterable of :class:`dask.array.Array`
        Values to compute. A single array is also accepted, in which case
        `targets` is treated as a single array too.
    targets : iterable of :class:`dask.array.Array`
        Destinations in which to store the results of computing `sources`, with
        the same length and matching shapes (the dtypes need not match, as long
        as they are assignable).
    safe : bool, optional
        If true (default), raise an exception if the operation is potentially
        unsafe. This can be an expensive operation (quadratic in the number of
        chunks).
    kwargs : dict
        Extra arguments are passed to the scheduler. ``optimize_graph`` is
        always forced to ``False`` because the graph is optimized here
        before it is computed.

    Raises
    ------
    UnsafeInplaceError
        if a data hazard is detected
    ValueError
        if the sources and targets have the wrong type or don't match
    """
    if isinstance(sources, da.Array):
        sources = [sources]
        targets = [targets]
    else:
        # Materialise both inputs: the checks below iterate them, which would
        # otherwise exhaust one-shot iterables before they are paired up.
        sources = list(sources)
        targets = list(targets)

    if any(not isinstance(s, da.Array) for s in sources):
        raise ValueError('All sources must be instances of da.Array')
    if any(not isinstance(t, da.Array) for t in targets):
        raise ValueError('All targets must be instances of da.Array')
    # zip() would silently drop the unmatched tail, leaving some sources
    # unstored without any error.
    if len(sources) != len(targets):
        raise ValueError('sources and targets must have the same length')

    chunked_sources = [
        source.rechunk(target.chunks)
        for source, target in zip(sources, targets)
    ]

    if safe:
        _safe_inplace(chunked_sources, targets)

    def store(target, source):
        target[:] = source

    out_keys = []
    layers = {}
    dependencies = {}
    store_layers = []
    for source, target in zip(chunked_sources, targets):
        name = 'store-' + source.name + '-' + target.name
        store_layers.append(name)
        indices = tuple(range(target.ndim))
        layer = blockwise(
            store, name, indices,
            target.name, indices,
            source.name, indices,
            numblocks={
                source.name: source.numblocks,
                target.name: target.numblocks
            })
        # Replicate behaviour of HighLevelGraph.from_collections
        layers[name] = layer
        dependencies[name] = set()
        for collection in source, target:
            graph = collection.__dask_graph__()
            layers.update(graph.layers)
            dependencies.update(graph.dependencies)
            dependencies[name].update(collection.__dask_layers__())
        out_keys.extend(layer.keys())

    # We don't have any outputs from storing, so to form a dask collection
    # we'll gather up all the output keys into one "root" key and form a
    # Delayed collection from it. This is similar to what da.store does.
    root_key = 'store-root-' + str(uuid.uuid4())
    layers[root_key] = {root_key: out_keys}
    dependencies[root_key] = set(store_layers)
    graph = HighLevelGraph(layers, dependencies)
    # Ensure that array-appropriate optimizations are performed.
    graph = da.Array.__dask_optimize__(graph, [root_key])
    result = Delayed(root_key, graph)

    # Forward the caller's scheduler options, but never re-optimize: the
    # array optimizations have already been applied above.
    compute_kwargs = dict(kwargs, optimize_graph=False)
    result.compute(**compute_kwargs)
def convert_criteo_to_parquet(
    input_path: str,
    output_path: str,
    client,
    gpu_mem_frac: float = 0.05,
):
    """Convert the Criteo ``day_*`` tsv files under ``input_path`` to parquet.

    The file ``day_23`` is first split into ``day_23.part1`` (the leading
    half, which gets the extra line when the count is odd) and
    ``day_23.part2`` (the trailing half), both written next to the original.
    Every ``day_*`` entry whose name does not end in ``parquet`` is then
    handed to ``_convert_file``, and ``_write_metadata`` runs once all
    conversions are done.

    Parameters
    ----------
    input_path : str
        Directory holding the ``day_*`` files.
    output_path : str
        Directory for the converted files; created if missing.
    client
        Only its truthiness is used: when falsy the graph is computed with
        the synchronous scheduler, otherwise with dask's default.
    gpu_mem_frac : float, optional
        Forwarded to ``_convert_file``.

    Raises
    ------
    RuntimeError
        If ``output_path`` is empty.
    subprocess.CalledProcessError
        If ``wc``, ``head`` or ``tail`` exits with a non-zero status.
    """
    print("Converting tsv to parquet files")
    if not output_path:
        raise RuntimeError(
            "Intermediate directory must be defined, if the dataset is tsv.")
    os.makedirs(output_path, exist_ok=True)

    # split last day into two parts
    last_day = os.path.join(input_path, "day_23")
    # Pass the command as an argument list so that a path containing
    # whitespace reaches wc as a single argument.
    number_of_lines = int(
        subprocess.check_output(["wc", "-l", last_day]).split()[0])
    valid_set_size = number_of_lines // 2
    test_set_size = number_of_lines - valid_set_size

    # check=True: a failed split must not leave a truncated part file that is
    # then converted as if it were complete.
    with open(os.path.join(input_path, "day_23.part1"), "w") as f:
        subprocess.run(
            ['head', '-n', str(test_set_size), str(last_day)],
            stdout=f,
            check=True,
        )

    with open(os.path.join(input_path, "day_23.part2"), "w") as f:
        subprocess.run(
            ['tail', '-n', str(valid_set_size), str(last_day)],
            stdout=f,
            check=True,
        )

    fs = get_fs_token_paths(input_path, mode="rb")[0]
    file_list = [
        x for x in fs.glob(fs.sep.join([input_path, "day_*"]))
        if not x.endswith("parquet")
    ]
    file_list = sorted(file_list, key=natural_sort_key)
    name_list = _analyze_paths(file_list, fs)[1]

    cols = CRITEO_CLICK_COLUMNS + CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS

    dtypes = {}
    dtypes[CRITEO_CLICK_COLUMNS[0]] = np.int64
    for x in CRITEO_CONTINUOUS_COLUMNS:
        dtypes[x] = np.int64
    for x in CRITEO_CATEGORICAL_COLUMNS:
        dtypes[x] = "hex"

    dsk = {}
    token = tokenize(file_list, name_list, output_path, gpu_mem_frac, fs, cols, dtypes)
    convert_file_name = "convert_file-" + token
    for i, (path, name) in enumerate(zip(file_list, name_list)):
        key = (convert_file_name, i)
        dsk[key] = (_convert_file, path, name, output_path, gpu_mem_frac, fs, cols, dtypes)

    # The metadata task lists every conversion key as an input, so it runs
    # only after all files have been converted.
    write_meta_name = "write-metadata-" + token
    dsk[write_meta_name] = (
        _write_metadata,
        [(convert_file_name, i) for i in range(len(file_list))],
        fs,
        output_path,
    )
    graph = HighLevelGraph.from_collections(write_meta_name, dsk, dependencies=[])
    conversion_delayed = Delayed(write_meta_name, graph)

    if client:
        conversion_delayed.compute()
    else:
        conversion_delayed.compute(scheduler="synchronous")

    print("Converted")
def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None, assume_equal_chunks=False, **kwargs):
    """Train a scikit-learn model on dask collections, one block at a time.

    The model must support the ``partial_fit`` interface for online or batch
    learning.  Ideally your rows are independent and identically
    distributed.  By default the blocks are visited in a random order.

    Parameters
    ----------
    model : sklearn model
        Any model supporting the ``partial_fit`` interface.
    x : dask Array
        Two dimensional array, likely tall and skinny.  Its block count and
        graph name are obtained from ``_blocks_and_name``.
    y : dask Array or None
        One dimensional array with the same number of blocks as ``x``.
    compute : bool
        Whether to compute this result.
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not.
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks.
    assume_equal_chunks : bool
        Accepted for interface compatibility; this function does not read it.
    kwargs :
        Options to pass to partial_fit.

    Returns
    -------
    The computed value of the last task in the chain when ``compute`` is
    true, otherwise the ``Delayed`` wrapping that task.

    Raises
    ------
    AssertionError
        If ``x`` and ``y`` do not have the same number of blocks.
    ValueError
        If ``model`` has no ``partial_fit`` attribute.

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can
    use the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    nblocks, x_name = _blocks_and_name(x)

    y_name = ""
    if y is not None:
        y_nblocks, y_name = _blocks_and_name(y)
        assert y_nblocks == nblocks

    if not hasattr(model, "partial_fit"):
        msg = "The class '{}' does not implement 'partial_fit'."
        raise ValueError(msg.format(type(model)))

    order = list(range(nblocks))
    if shuffle_blocks:
        sklearn.utils.check_random_state(random_state).shuffle(order)

    # The visiting order is part of the token, so different shuffles give
    # differently named graphs.
    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)

    # Chunked inputs with more than one dimension are addressed by a key
    # with a trailing column-block index of 0; everything else by
    # (name, block) alone.
    x_extra = (0, ) if hasattr(x, "chunks") and x.ndim > 1 else ()

    # Key -1 seeds the chain with the untrained model; step i consumes the
    # output of step i - 1 and the block selected by the (shuffled) order.
    tasks = {(name, -1): model}
    for step, block in enumerate(order):
        tasks[(name, step)] = (
            _partial_fit,
            (name, step - 1),
            (x_name, block) + x_extra,
            (y_name, block),
            kwargs,
        )

    dependencies = [x] if y is None else [x, y]
    graph = HighLevelGraph.from_collections(name, tasks, dependencies=dependencies)

    # The ``layer`` argument is only supplied when DASK_2022_01_0 is truthy.
    delayed_options = {"layer": name} if DASK_2022_01_0 else {}
    value = Delayed((name, nblocks - 1), graph, **delayed_options)

    if not compute:
        return value
    return value.compute()