Example No. 1
def dask_to_tfrecords(df,
                      folder,
                      compression_type="GZIP",
                      compression_level=9):
    """Store Dask.dataframe to TFRecord files."""
    makedirs(folder, exist_ok=True)
    compression_ext = get_compression_ext(compression_type)
    filenames = [
        get_part_filename(i, compression_ext) for i in range(df.npartitions)
    ]

    # Also write a meta data file
    write_meta(df, folder, compression_type)

    dsk = {}
    name = "to-tfrecord-" + tokenize(df, folder)
    part_tasks = []
    kwargs = {}

    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (apply, pandas_df_to_tfrecords, [
            (df._name, d),
            os.path.join(folder, filename), compression_type, compression_level
        ], kwargs)
        part_tasks.append((name, d))

    dsk[name] = (lambda x: None, part_tasks)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    out = Delayed(name, graph)
    out = out.compute()
    return out
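
For context, a hypothetical usage sketch. It assumes the helpers referenced above (get_compression_ext, get_part_filename, write_meta, pandas_df_to_tfrecords) and the module-level imports (makedirs, os, apply, tokenize, HighLevelGraph, Delayed) live in the same module:

import dask.dataframe as dd
import pandas as pd

# Build a small dask DataFrame and write one TFRecord file per partition,
# plus the metadata file emitted by write_meta.
pdf = pd.DataFrame({"feature": range(100), "label": [0, 1] * 50})
ddf = dd.from_pandas(pdf, npartitions=4)
dask_to_tfrecords(ddf, "tfrecords_out", compression_type="GZIP", compression_level=9)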
Example No. 2
def test_persist_delayed_custom_key(key):
    d = Delayed(key, {key: "b", "b": 1})
    assert d.compute() == 1
    dp = d.persist()
    assert dp.compute() == 1
    assert dp.key == key
    assert dict(dp.dask) == {key: 1}
Example No. 3
def test_persist_delayed_rename(key, rename, new_key):
    d = Delayed(key, {key: 1})
    assert d.compute() == 1
    rebuild, args = d.__dask_postpersist__()
    dp = rebuild({new_key: 2}, *args, rename=rename)
    assert dp.compute() == 2
    assert dp.key == new_key
    assert dict(dp.dask) == {new_key: 2}
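
Both tests above build a Delayed directly from a key and a plain task-graph dict. A minimal standalone sketch of that pattern, with no project-specific pieces:

from dask.delayed import Delayed

# "total" is a task that sums the values stored under keys "a" and "b".
graph = {"a": 1, "b": 2, "total": (sum, ["a", "b"])}
d = Delayed("total", graph)
assert d.compute() == 3
assert d.key == "total"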
Example No. 4
def comp(dag, blocker_list):
    from copy import deepcopy
    dag = deepcopy(dag)
    _b = convert_ldicts_to_sdict(blocker_list)

    last_node = get_lastnode(dict(_b))
    if last_node != dag.key:
        dag = Delayed(last_node, _b)
    else:
        # Delayed.dask is a read-only property, so rebuild the object around
        # the merged graph instead of assigning to it.
        dag = Delayed(dag.key, _b)
    x = dag.compute()
    return x
Example No. 5
def fit(model, x, y, compute=True, **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    This method will be called on dask arrays in sequential order.  Ideally
    your rows are independent and identically distributed.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data.

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    assert x.ndim == 2
    if y is not None:
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]
    assert hasattr(model, 'partial_fit')
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])

    name = 'fit-' + dask.base.tokenize(model, x, y, kwargs)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (_partial_fit, (name, i - 1), (x.name, i, 0),
                            (getattr(y, 'name', ''), i), kwargs)
                for i in range(nblocks)})

    new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {}))
    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
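
The _partial_fit helper is not shown in this listing. A plausible sketch of what the example assumes, modelled on the corresponding helper in dask.array.learn: train the incoming model on one block and return it, so the tasks form a sequential chain.

def _partial_fit(model, x, y, kwargs=None):
    # One link in the chain (name, -1) -> (name, 0) -> ... -> (name, nblocks - 1):
    # fit on a single block of data, then hand the model to the next task.
    kwargs = kwargs or {}
    model.partial_fit(x, y, **kwargs)
    return model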
Example No. 6
def fit(model,
        x,
        y,
        compute=True,
        shuffle_blocks=True,
        random_state=None,
        **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data.

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()

        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]

    assert hasattr(model, "partial_fit")
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x.name, order[i], 0),
        (getattr(y, "name", ""), order[i]),
        kwargs,
    )
                for i in range(nblocks)})

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph

        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
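
A hypothetical usage sketch of this variant, assuming the definitions above and their module-level imports. With compute=False the function returns a Delayed that can be inspected or submitted later:

import dask.array as da
from sklearn.linear_model import SGDClassifier

X = da.random.random((10, 3), chunks=(5, 3))
y = da.random.randint(0, 2, size=10, chunks=(5,))

# classes is forwarded to partial_fit via **kwargs; SGDClassifier needs it on
# the first partial_fit call.
lazy = fit(SGDClassifier(), X, y, compute=False, shuffle_blocks=True,
           random_state=0, classes=[0, 1])
model = lazy.compute()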
Example No. 7
def execute_plan(self, plan: Delayed, **kwargs):
    return plan.compute(**kwargs)
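
A hypothetical usage sketch; executor stands in for whatever object defines this method, and the plan is any dask Delayed:

import dask

plan = dask.delayed(sum)([1, 2, 3])   # a Delayed "plan"
result = executor.execute_plan(plan, scheduler="threads")
assert result == 6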
Example No. 8
def store_inplace(sources, targets, safe=True, **kwargs):
    """Evaluate a dask computation and store results in the original numpy arrays.

    Dask is designed to operate on immutable data: the key for a node in the
    graph is intended to uniquely identify the value. It's possible to create
    tasks that modify the backing storage, but it can potentially create race
    conditions where a value might be replaced either before or after it is
    used. This function provides safety checks that will raise an exception if
    there is a risk of this happening.

    Despite the safety checks, it still requires some user care to be used
    safely:

    - The arrays in `targets` must be backed by numpy arrays, with no
      computations other than slicing. Thus, the dask functions
      :func:`~dask.array.asarray`, :func:`~dask.array.from_array`,
      :func:`~dask.array.concatenate` and :func:`~dask.array.stack` are safe.
    - The target keys must be backed by *distinct* numpy arrays. This is not
      currently checked (although duplicate keys will be detected).
    - When creating a target array with :func:`~dask.array.from_array`,
      ensure that the array has a unique name (e.g., by passing
      ``name=False``).
    - The safety check only applies to the sources and targets passed to this
      function. Any simultaneous use of objects based on the targets is
      invalid, and afterwards any dask objects based on the targets will be
      computed with the overwritten values.

    The safety check is conservative, i.e., there may be cases where it will
    throw an exception even though the operation can be proven to be safe.

    Each source is rechunked to match the chunks of the target. In cases where
    the target is backed by a single large numpy array, it may be more
    efficient to construct a new dask wrapper of that numpy array whose
    chunking matches the source.

    Parameters
    ----------
    sources : iterable of :class:`dask.array.Array`
        Values to compute.
    targets : iterable of :class:`dask.array.Array`
        Destinations in which to store the results of computing `sources`, with
        the same length and matching shapes (the dtypes need not match, as long
        as they are assignable).
    safe : bool, optional
        If true (default), raise an exception if the operation is potentially
        unsafe. This can be an expensive operation (quadratic in the number of
        chunks).
    kwargs : dict
        Extra arguments are passed to the scheduler

    Raises
    ------
    UnsafeInplaceError
        if a data hazard is detected
    ValueError
        if the sources and targets have the wrong type or don't match
    """
    if isinstance(sources, da.Array):
        sources = [sources]
        targets = [targets]

    if any(not isinstance(s, da.Array) for s in sources):
        raise ValueError('All sources must be instances of da.Array')
    if any(not isinstance(t, da.Array) for t in targets):
        raise ValueError('All targets must be instances of da.Array')

    chunked_sources = [
        source.rechunk(target.chunks)
        for source, target in zip(sources, targets)
    ]
    if safe:
        _safe_inplace(chunked_sources, targets)

    def store(target, source):
        target[:] = source

    out_keys = []
    layers = {}
    dependencies = {}
    store_layers = []
    for source, target in zip(chunked_sources, targets):
        name = 'store-' + source.name + '-' + target.name
        store_layers.append(name)
        indices = tuple(range(target.ndim))
        layer = blockwise(store,
                          name,
                          indices,
                          target.name,
                          indices,
                          source.name,
                          indices,
                          numblocks={
                              source.name: source.numblocks,
                              target.name: target.numblocks
                          })
        # Replicate behaviour of HighLevelGraph.from_collections
        layers[name] = layer
        dependencies[name] = set()
        for collection in source, target:
            graph = collection.__dask_graph__()
            layers.update(graph.layers)
            dependencies.update(graph.dependencies)
            dependencies[name].update(collection.__dask_layers__())
        out_keys.extend(layer.keys())

    # We don't have any outputs from storing, so to form a dask collection
    # we'll gather up all the output keys into one "root" key and form a
    # Delayed collection from it. This is similar to what da.store does.
    root_key = 'store-root-' + str(uuid.uuid4())
    layers[root_key] = {root_key: out_keys}
    dependencies[root_key] = set(store_layers)
    graph = HighLevelGraph(layers, dependencies)
    # Ensure that array-appropriate optimizations are performed.
    graph = da.Array.__dask_optimize__(graph, [root_key])
    result = Delayed(root_key, graph)
    result.compute(optimize_graph=False)
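
A hypothetical usage sketch of store_inplace, following the rules spelled out in the docstring and assuming the module-level imports and helpers (da, blockwise, HighLevelGraph, uuid, _safe_inplace) are available. The target wraps a plain numpy array via from_array with name=False, and the source has a matching shape:

import numpy as np
import dask.array as da

backing = np.zeros((4, 4))
target = da.from_array(backing, chunks=(2, 2), name=False)
source = da.full((4, 4), 7.0, chunks=(2, 2))

# The computed source blocks are written straight into `backing`.
store_inplace([source], [target])
assert (backing == 7).all()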
Example No. 9
def convert_criteo_to_parquet(
    input_path: str,
    output_path: str,
    client,
    gpu_mem_frac: float = 0.05,
):
    print("Converting tsv to parquet files")
    if not output_path:
        raise RuntimeError(
            "Intermediate directory must be defined, if the dataset is tsv.")
    os.makedirs(output_path, exist_ok=True)

    # split last day into two parts
    day_23_path = os.path.join(input_path, "day_23")
    number_of_lines = int(
        subprocess.check_output(["wc", "-l", day_23_path]).split()[0])
    valid_set_size = number_of_lines // 2
    test_set_size = number_of_lines - valid_set_size

    with open(os.path.join(input_path, "day_23.part1"), "w") as f:
        subprocess.run(["head", "-n", str(test_set_size), day_23_path],
                       stdout=f)

    with open(os.path.join(input_path, "day_23.part2"), "w") as f:
        subprocess.run(["tail", "-n", str(valid_set_size), day_23_path],
                       stdout=f)

    fs = get_fs_token_paths(input_path, mode="rb")[0]
    file_list = [
        x for x in fs.glob(fs.sep.join([input_path, "day_*"]))
        if not x.endswith("parquet")
    ]
    file_list = sorted(file_list, key=natural_sort_key)
    name_list = _analyze_paths(file_list, fs)[1]

    cols = CRITEO_CLICK_COLUMNS + CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS

    dtypes = {}
    dtypes[CRITEO_CLICK_COLUMNS[0]] = np.int64
    for x in CRITEO_CONTINUOUS_COLUMNS:
        dtypes[x] = np.int64
    for x in CRITEO_CATEGORICAL_COLUMNS:
        dtypes[x] = "hex"

    dsk = {}
    token = tokenize(file_list, name_list, output_path, gpu_mem_frac, fs, cols,
                     dtypes)
    convert_file_name = "convert_file-" + token
    for i, (path, name) in enumerate(zip(file_list, name_list)):
        key = (convert_file_name, i)
        dsk[key] = (_convert_file, path, name, output_path, gpu_mem_frac, fs,
                    cols, dtypes)

    write_meta_name = "write-metadata-" + token
    dsk[write_meta_name] = (
        _write_metadata,
        [(convert_file_name, i) for i in range(len(file_list))],
        fs,
        output_path,
    )
    graph = HighLevelGraph.from_collections(write_meta_name,
                                            dsk,
                                            dependencies=[])
    conversion_delayed = Delayed(write_meta_name, graph)

    if client:
        conversion_delayed.compute()
    else:
        conversion_delayed.compute(scheduler="synchronous")

    print("Converted")
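
A hypothetical invocation; the paths are purely illustrative, and the CRITEO_* column constants plus the _convert_file and _write_metadata helpers referenced above are assumed to be defined in the same module:

# Without a dask.distributed client the conversion falls back to the
# synchronous scheduler.
convert_criteo_to_parquet(
    input_path="/data/criteo/raw",        # directory containing day_0 ... day_23
    output_path="/data/criteo/parquet",
    client=None,
    gpu_mem_frac=0.05,
)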
Example No. 10
def fit(model,
        x,
        y,
        compute=True,
        shuffle_blocks=True,
        random_state=None,
        assume_equal_chunks=False,
        **kwargs):
    """Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data.

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """

    nblocks, x_name = _blocks_and_name(x)
    if y is not None:
        y_nblocks, y_name = _blocks_and_name(y)
        assert y_nblocks == nblocks
    else:
        y_name = ""

    if not hasattr(model, "partial_fit"):
        msg = "The class '{}' does not implement 'partial_fit'."
        raise ValueError(msg.format(type(model)))

    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)

    if hasattr(x, "chunks") and x.ndim > 1:
        x_extra = (0, )
    else:
        x_extra = ()

    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x_name, order[i]) + x_extra,
        (y_name, order[i]),
        kwargs,
    )
                for i in range(nblocks)})

    dependencies = [x]
    if y is not None:
        dependencies.append(y)
    new_dsk = HighLevelGraph.from_collections(name,
                                              dsk,
                                              dependencies=dependencies)

    if DASK_2022_01_0:
        value = Delayed((name, nblocks - 1), new_dsk, layer=name)
    else:
        value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
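
The _blocks_and_name helper is not shown. A plausible sketch of what this variant assumes: return the number of blocks along the first axis and the graph name, for either a dask Array or a dask DataFrame/Series.

def _blocks_and_name(obj):
    # Dask arrays expose .chunks and .name; dask DataFrames/Series expose
    # .npartitions and ._name.
    if hasattr(obj, "chunks"):
        return len(obj.chunks[0]), obj.name
    if hasattr(obj, "npartitions"):
        return obj.npartitions, obj._name
    raise TypeError("Expected a dask Array or DataFrame, got %r" % type(obj))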