Example #1
def get_colors(palette, funcs):
    """Get a dict mapping funcs to colors from palette.

    Parameters
    ----------
    palette : string
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    funcs : iterable
        Iterable of function names
    """
    palettes = import_required("bokeh.palettes", _BOKEH_MISSING_MSG)

    unique_funcs = sorted(unique(funcs))
    n_funcs = len(unique_funcs)
    palette_lookup = palettes.all_palettes[palette]
    keys = list(sorted(palette_lookup.keys()))
    index = keys[min(bisect_left(keys, n_funcs), len(keys) - 1)]
    palette = palette_lookup[index]
    # Some bokeh palettes repeat colors, we want just the unique set
    palette = list(unique(palette))
    if len(palette) > n_funcs:
        # Consistently shuffle palette - prevents just using low-range
        random.Random(42).shuffle(palette)
    color_lookup = dict(zip(unique_funcs, cycle(palette)))
    return [color_lookup[n] for n in funcs]
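A minimal usage sketch (assuming bokeh is installed and that get_colors above is in scope; the function names are made-up placeholders). Each entry of funcs gets a color, and repeated names share the same color:

    funcs = ["add", "mul", "add", "sum"]
    colors = get_colors("Viridis", funcs)   # "Viridis" is a key of bokeh.palettes.all_palettes
    assert len(colors) == len(funcs)        # one color per entry in funcs
    assert colors[0] == colors[2]           # "add" appears twice and keeps its color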
Example #2
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed
    import_required(
        'fastavro', "fastavro is a required dependency for using "
        "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path,
                                   sample=False,
                                   blocksize=blocksize,
                                   delimiter=head['sync'],
                                   include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
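The public entry point for this reader is dask.bag.read_avro; a usage sketch with hypothetical paths (fastavro must be installed):

    import dask.bag as db

    bag = db.read_avro("data/part-*.avro", blocksize=None)  # one partition per file
    records = bag.take(3)                                    # a few decoded records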
Example #3
def read_avro(urlpath, blocksize=100000000, storage_options=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import open_files, read_bytes
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    files = open_files(urlpath, **storage_options)
    if blocksize is not None:
        dhead = delayed(open_head)
        heads = compute(*[dhead(f) for f in files])
        dread = delayed(read_chunk)
        bits = []
        for head, f in zip(heads, files):
            _, chunks = read_bytes(f.path, sample=False, blocksize=blocksize,
                                   delimiter=head['sync'], include_path=False,
                                   **storage_options)
            bits.extend([dread(ch, head) for ch in chunks[0]])
        return from_delayed(bits)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
Example #4
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated people.
    This requires the optional library ``mimesis`` to generate records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    import_required(
        "mimesis",
        "The mimesis module is required for this function.  Try:\n"
        "  python -m pip install mimesis",
    )

    schema = lambda field: {
        "age": field("person.age"),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {"address": field("address.address"), "city": field("address.city")},
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    return _make_mimesis(
        {"locale": locale}, schema, npartitions, records_per_partition, seed
    )
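A usage sketch through the public dask.datasets module (requires the optional mimesis package; the seed makes the generated records reproducible):

    import dask.datasets

    people = dask.datasets.make_people(npartitions=2, records_per_partition=5, seed=1)
    print(people.take(1))  # e.g. one dict with 'age', 'name', 'occupation', ...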
Example #5
    def run(self):

        psutil = import_required(
            "psutil", "Tracking resource usage requires `psutil` to be installed"
        )
        self.parent = psutil.Process(self.parent_pid)

        pid = current_process()
        data = []
        while True:
            try:
                msg = self.child_conn.recv()
            except KeyboardInterrupt:
                continue
            if msg == "shutdown":
                break
            elif msg == "collect":
                ps = self._update_pids(pid)
                while not data or not self.child_conn.poll():
                    tic = default_timer()
                    mem = cpu = 0
                    for p in ps:
                        try:
                            mem2 = p.memory_info().rss
                            cpu2 = p.cpu_percent()
                        except Exception:  # could be a few different exceptions
                            pass
                        else:
                            # Only increment if both were successful
                            mem += mem2
                            cpu += cpu2
                    data.append((tic, mem / 1e6, cpu))
                    sleep(self.dt)
            elif msg == "send_data":
                self.child_conn.send(data)
                data = []
        self.child_conn.close()
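This run loop appears to be the background tracker behind dask.diagnostics.ResourceProfiler; assuming that is the case, a typical way to exercise it is:

    import dask.array as da
    from dask.diagnostics import ResourceProfiler

    x = da.random.random((2000, 2000), chunks=(500, 500))
    with ResourceProfiler(dt=0.25) as rprof:  # samples memory/CPU every 0.25 s
        x.dot(x.T).sum().compute()
    print(rprof.results[:3])                  # (timestamp, memory in MB, % CPU) tuples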
Example #6
def initialize_event_loop(config):
    event_loop = dask.config.get("distributed.admin.event-loop")
    if event_loop == "uvloop":
        uvloop = import_required(
            "uvloop",
            "The distributed.admin.event-loop configuration value "
            "is set to 'uvloop' but the uvloop module is not installed"
            "\n\n"
            "Please either change the config value or install one of the following\n"
            "    conda install uvloop\n"
            "    pip install uvloop",
        )
        uvloop.install()
    elif event_loop in {"asyncio", "tornado"}:
        if WINDOWS:
            # WindowsProactorEventLoopPolicy is not compatible with tornado 6
            # fallback to the pre-3.8 default of Selector
            # https://github.com/tornadoweb/tornado/issues/2608
            asyncio.set_event_loop_policy(
                asyncio.WindowsSelectorEventLoopPolicy())
    else:
        raise ValueError(
            "Expected distributed.admin.event-loop to be in ('asyncio', 'tornado', 'uvloop'), got %s"
            % dask.config.get("distributed.admin.event-loop"))
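A sketch of how the configuration value driving this function could be set; note that the config argument is not used in the snippet above, the value is re-read from dask.config:

    import dask

    # Opt into uvloop before the event loop is created (uvloop must be installed).
    dask.config.set({"distributed.admin.event-loop": "uvloop"})
    initialize_event_loop(dask.config.config)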
Example #7
def cytoscape_graph(
    dsk,
    filename: str | None = "mydask",
    format: str | None = None,
    *,
    rankdir: str = "BT",
    node_sep: float = 10,
    edge_sep: float = 10,
    spacing_factor: float = 1,
    node_style: dict[str, str] | None = None,
    edge_style: dict[str, str] | None = None,
    **kwargs,
):
    """
    Create an ipycytoscape widget for a dask graph.

    If `filename` is not None, write an HTML file to disk with the specified name.

    This uses the Cytoscape dagre layout algorithm. Options for that are documented here:
    https://github.com/cytoscape/cytoscape.js-dagre#api

    Parameters
    ----------
    dsk : dict
        The graph to display.
    filename : str or None, optional
        The name of the HTML file to write to disk.
    format : str, optional
        Not used in this engine.
    rankdir: str
        The direction in which to orient the visualization.
    node_sep: float
        The separation (in px) between nodes in the DAG layout.
    edge_sep: float
        The separation (in px) between edges in the DAG layout.
    spacing_factor: float
        An overall scaling factor to increase (>1) or decrease (<1) the spacing
        of the layout.
    node_style: dict[str, str], optional
        A dictionary of style attributes for nodes (refer to Cytoscape JSON docs
        for available options: https://js.cytoscape.org/#notation/elements-json)
    edge_style: dict[str, str], optional
        A dictionary of style attributes for edges (refer to Cytoscape JSON docs
        for available options: https://js.cytoscape.org/#notation/elements-json)
    **kwargs
        Additional keyword arguments to forward to `_to_cytoscape_json`.

    Returns
    -------
    result : ipycytoscape.CytoscapeWidget
    """
    ipycytoscape = import_required(
        "ipycytoscape",
        "Drawing dask graphs with the cytoscape engine requires the `ipycytoscape` "
        "python library.\n\n"
        "Please either conda or pip install as follows:\n\n"
        "  conda install ipycytoscape            # either conda install\n"
        "  python -m pip install ipycytoscape    # or pip install",
    )

    node_style = node_style or {}
    edge_style = edge_style or {}

    data = _to_cytoscape_json(dsk, **kwargs)
    # TODO: it's not easy to programmatically increase the height of the widget.
    # Ideally we would make it a bit bigger, but that will probably require upstreaming
    # some fixes.
    g = ipycytoscape.CytoscapeWidget(
        layout={"height": "400px"},
    )
    g.set_layout(
        name="dagre",
        rankDir=rankdir,
        nodeSep=node_sep,
        edgeSep=edge_sep,
        spacingFactor=spacing_factor,
        nodeDimensionsIncludeLabels=True,
    )
    g.graph.add_graph_from_json(
        data,
        directed=True,
    )
    g.set_style(
        [
            {
                "selector": "node",
                "style": {
                    "font-family": "helvetica",
                    "font-size": "24px",
                    "font-weight": "bold",
                    "color": "black",
                    "background-color": "#eee",
                    "border-color": "data(color)",
                    "border-width": 4,
                    "opacity": "1.0",
                    "text-valign": "center",
                    "text-halign": "center",
                    "label": "data(label)",
                    "shape": "data(shape)",
                    "width": 64,
                    "height": 64,
                    **node_style,
                },
            },
            {
                "selector": "edge",
                "style": {
                    "width": 8,
                    "line-color": "gray",
                    "target-arrow-shape": "triangle",
                    "target-arrow-color": "gray",
                    "curve-style": "bezier",
                    **edge_style,
                },
            },
        ],
    )
    # Tweak the zoom sensitivity
    z = g.zoom
    g.max_zoom = z * 2.0
    g.min_zoom = z / 10.0
    g.wheel_sensitivity = 0.1

    if filename is not None:
        from ipywidgets.embed import embed_minimal_html

        filename = filename if filename.endswith(".html") else filename + ".html"
        embed_minimal_html(filename, views=[g], title="Dask task graph")
    return g
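In practice this is usually reached via the visualize machinery rather than called directly; a sketch, assuming ipycytoscape and ipywidgets are installed and that recent dask versions expose the engine keyword on visualize:

    import dask.array as da

    x = (da.ones((15, 15), chunks=(5, 5)) + 1).sum()
    # Direct call on the materialized graph, skipping the HTML file:
    widget = cytoscape_graph(dict(x.__dask_graph__()), filename=None)
    # The assumed high-level route:
    x.visualize(engine="cytoscape")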
Example #8
def to_graphviz(
    dsk,
    data_attributes=None,
    function_attributes=None,
    rankdir="BT",
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    collapse_outputs=False,
    verbose=False,
    **kwargs,
):
    graphviz = import_required(
        "graphviz",
        "Drawing dask graphs with the graphviz engine requires the `graphviz` "
        "python library and the `graphviz` system library.\n\n"
        "Please either conda or pip install as follows:\n\n"
        "  conda install python-graphviz     # either conda install\n"
        "  python -m pip install graphviz    # or pip install and follow installation instructions",
    )

    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}
    graph_attr = graph_attr or {}
    node_attr = node_attr or {}
    edge_attr = edge_attr or {}

    graph_attr["rankdir"] = rankdir
    node_attr["fontname"] = "helvetica"

    graph_attr.update(kwargs)
    g = graphviz.Digraph(
        graph_attr=graph_attr, node_attr=node_attr, edge_attr=edge_attr
    )

    seen = set()
    connected = set()

    for k, v in dsk.items():
        k_name = name(k)
        if istask(v):
            func_name = name((k, "function")) if not collapse_outputs else k_name
            if collapse_outputs or func_name not in seen:
                seen.add(func_name)
                attrs = function_attributes.get(k, {}).copy()
                attrs.setdefault("label", key_split(k))
                attrs.setdefault("shape", "circle")
                g.node(func_name, **attrs)
            if not collapse_outputs:
                g.edge(func_name, k_name)
                connected.add(func_name)
                connected.add(k_name)

            for dep in get_dependencies(dsk, k):
                dep_name = name(dep)
                if dep_name not in seen:
                    seen.add(dep_name)
                    attrs = data_attributes.get(dep, {}).copy()
                    attrs.setdefault("label", box_label(dep, verbose))
                    attrs.setdefault("shape", "box")
                    g.node(dep_name, **attrs)
                g.edge(dep_name, func_name)
                connected.add(dep_name)
                connected.add(func_name)

        elif ishashable(v) and v in dsk:
            v_name = name(v)
            g.edge(v_name, k_name)
            connected.add(v_name)
            connected.add(k_name)

        if (not collapse_outputs or k_name in connected) and k_name not in seen:
            seen.add(k_name)
            attrs = data_attributes.get(k, {}).copy()
            attrs.setdefault("label", box_label(k, verbose))
            attrs.setdefault("shape", "box")
            g.node(k_name, **attrs)
    return g
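This is the kind of low-level renderer that .visualize() ends up in when the graphviz engine is used; a sketch of both entry points (requires both the graphviz Python package and the system library):

    import dask.array as da

    x = da.ones((10, 10), chunks=(5, 5)).sum()
    g = to_graphviz(dict(x.__dask_graph__()), rankdir="LR")  # a graphviz.Digraph
    g.render("graph", format="svg")                          # writes graph.svg
    # The usual high-level route:
    x.visualize(filename="graph.svg", rankdir="LR")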
Example #9
def read_avro(urlpath, blocksize=100000000, storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths,
                                 OpenFile, tokenize)
    from dask.bag import from_delayed
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [dread(f, o, l, head, dask_key_name=key)
                      for o, key, l in zip(offset, keys, length)]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, compression=compression, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
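Compared with the earlier versions, this one also accepts whole-file compression; a short sketch with hypothetical paths (compression only makes sense together with blocksize=None):

    import dask.bag as db

    bag = db.read_avro("data/part-*.avro.gz", blocksize=None, compression="gzip")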
Example #10
def read_avro(urlpath,
              blocksize=100000000,
              storage_options=None,
              compression=None):
    """Read set of avro files

    Use this with arbitrary nested avro schemas. Please refer to the
    fastavro documentation for its capabilities:
    https://github.com/fastavro/fastavro

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), or globstring pointing to data.
    blocksize: int or None
        Size of chunks in bytes. If None, there will be no chunking and each
        file will become one partition.
    storage_options: dict or None
        passed to backend file-system
    compression: str or None
        Compression format of the target(s), like 'gzip'. Should only be used
        with blocksize=None.
    """
    from dask.utils import import_required
    from dask import delayed, compute
    from dask.bytes.core import (open_files, get_fs_token_paths, OpenFile,
                                 tokenize)
    from dask.bag import from_delayed
    import_required(
        'fastavro', "fastavro is a required dependency for using "
        "bag.read_avro().")

    storage_options = storage_options or {}
    if blocksize is not None:
        fs, fs_token, paths = get_fs_token_paths(
            urlpath, mode='rb', storage_options=storage_options)
        dhead = delayed(open_head)
        out = compute(*[dhead(fs, path, compression) for path in paths])
        heads, sizes = zip(*out)
        dread = delayed(read_chunk)

        offsets = []
        lengths = []
        for size in sizes:
            off = list(range(0, size, blocksize))
            length = [blocksize] * len(off)
            offsets.append(off)
            lengths.append(length)

        out = []
        for path, offset, length, head in zip(paths, offsets, lengths, heads):
            delimiter = head['sync']
            f = OpenFile(fs, path, compression=compression)
            token = tokenize(fs_token, delimiter, path, fs.ukey(path),
                             compression, offset)
            keys = ['read-avro-%s-%s' % (o, token) for o in offset]
            values = [
                dread(f, o, l, head, dask_key_name=key)
                for o, key, l in zip(offset, keys, length)
            ]
            out.extend(values)

        return from_delayed(out)
    else:
        files = open_files(urlpath, **storage_options)
        dread = delayed(read_file)
        chunks = [dread(fo) for fo in files]
        return from_delayed(chunks)
Example #11
def to_avro(b,
            filename,
            schema,
            name_function=None,
            storage_options=None,
            codec="null",
            sync_interval=16000,
            metadata=None,
            compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
            {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files

    import_required(
        "fastavro",
        "fastavro is a required dependency for using bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename,
                       "wb",
                       name_function=name_function,
                       num=b.npartitions,
                       **storage_options)
    name = "to-avro-" + uuid.uuid4().hex
    dsk = {(name, i): (
        _write_avro_part,
        (b.name, i),
        f,
        schema,
        codec,
        sync_interval,
        metadata,
    )
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()
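The docstring's example writes eagerly; a sketch of the deferred path, where compute=False returns delayed writes that can be scheduled together with other work:

    import dask
    import dask.bag as db

    b = db.from_sequence([{"name": "Alice", "value": 100},
                          {"name": "Bob", "value": 200}])
    schema = {"name": "People", "type": "record",
              "fields": [{"name": "name", "type": "string"},
                         {"name": "value", "type": "int"}]}
    writes = b.to_avro("people.*.avro", schema, compute=False)  # list of Delayed objects
    dask.compute(writes)                                        # performs the writes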
Example #12
def plot_cache(
    results, dsk, start_time, metric_name, palette="Viridis", label_size=60, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size: int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
        if BOKEH_VERSION().major >= 3:
            warnings.warn("Use width instead of plot_width with Bokeh >= 3")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
        if BOKEH_VERSION().major >= 3:
            warnings.warn("Use height instead of plot_height with Bokeh >= 3")
    defaults.update(**kwargs)

    if results:
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)
        data = {}
        for k, vals in groups.items():
            cnts = dict.fromkeys(tics, 0)
            for v in vals:
                cnts[v.cache_time] += v.metric
                cnts[v.free_time] -= v.metric
            data[k] = [0] + list(accumulate(add, pluck(1, sorted(cnts.items()))))

        tics = [0] + [i - start_time for i in tics]
        p = bp.figure(x_range=[0, max(tics)], **defaults)

        for (key, val), color in zip(data.items(), get_colors(palette, data.keys())):
            p.line(
                "x",
                "y",
                line_color=color,
                line_width=3,
                source=bp.ColumnDataSource(
                    {"x": tics, "y": val, "label": [key for i in val]}
                ),
            )

    else:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults)
    p.yaxis.axis_label = f"Cache Size ({metric_name})"
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span>
    </div>
    """
    return p
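plot_cache is normally reached through CacheProfiler rather than called directly; a sketch:

    import dask.array as da
    from dask.diagnostics import CacheProfiler

    x = da.random.random((1000, 1000), chunks=(250, 250))
    with CacheProfiler() as cprof:  # records when intermediate results enter and leave the cache
        x.dot(x.T).sum().compute()
    cprof.visualize()               # builds the bokeh figure via plot_cache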
Example #13
def plot_resources(results, palette="Viridis", **kwargs):
    """Plot resource usage in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of ResourceProfiler.results
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by plot_resources.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh import palettes
    from bokeh.models import LinearAxis, Range1d

    defaults = dict(
        title="Profile Results",
        tools="save,reset,xwheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
        if BOKEH_VERSION().major >= 3:
            warnings.warn("Use width instead of plot_width with Bokeh >= 3")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
        if BOKEH_VERSION().major >= 3:
            warnings.warn("Use height instead of plot_height with Bokeh >= 3")

    # Drop `label_size` to match `plot_cache` and `plot_tasks` kwargs
    if "label_size" in kwargs:
        kwargs.pop("label_size")

    defaults.update(**kwargs)

    if results:
        t, mem, cpu = zip(*results)
        left, right = min(t), max(t)
        t = [i - left for i in t]
        p = bp.figure(
            y_range=fix_bounds(0, max(cpu), 100),
            x_range=fix_bounds(0, right - left, 1),
            **defaults,
        )
    else:
        t = mem = cpu = []
        p = bp.figure(y_range=(0, 100), x_range=(0, 1), **defaults)
    colors = palettes.all_palettes[palette][6]
    p.line(
        t,
        cpu,
        color=colors[0],
        line_width=4,
        legend_label="% CPU",
    )
    p.yaxis.axis_label = "% CPU"
    p.extra_y_ranges = {
        "memory": Range1d(
            *fix_bounds(min(mem) if mem else 0, max(mem) if mem else 100, 100)
        )
    }
    p.line(
        t,
        mem,
        color=colors[2],
        y_range_name="memory",
        line_width=4,
        legend_label="Memory",
    )
    p.add_layout(LinearAxis(y_range_name="memory", axis_label="Memory (MB)"), "right")
    p.xaxis.axis_label = "Time (s)"
    return p
Example #14
def plot_tasks(results, dsk, palette="Viridis", label_size=60, **kwargs):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of Profiler.results
    dsk : dict
        The dask graph being profiled.
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size: int (optional)
        Maximum size of output labels in plot, defaults to 60
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,xwheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update(**kwargs)

    if results:
        keys, tasks, starts, ends, ids = zip(*results)

        id_group = groupby(itemgetter(4), results)
        timings = {
            k: [i.end_time - i.start_time for i in v] for (k, v) in id_group.items()
        }
        id_lk = {
            t[0]: n
            for (n, t) in enumerate(
                sorted(timings.items(), key=itemgetter(1), reverse=True)
            )
        }

        left = min(starts)
        right = max(ends)

        p = bp.figure(
            y_range=[str(i) for i in range(len(id_lk))],
            x_range=[0, right - left],
            **defaults,
        )

        data = {}
        data["width"] = width = [e - s for (s, e) in zip(starts, ends)]
        data["x"] = [w / 2 + s - left for (w, s) in zip(width, starts)]
        data["y"] = [id_lk[i] + 1 for i in ids]
        data["function"] = funcs = [pprint_task(i, dsk, label_size) for i in tasks]
        data["color"] = get_colors(palette, funcs)
        data["key"] = [str(i) for i in keys]

        source = bp.ColumnDataSource(data=data)

        p.rect(
            source=source,
            x="x",
            y="y",
            height=1,
            width="width",
            color="color",
            line_color="gray",
        )
    else:
        p = bp.figure(y_range=[str(i) for i in range(8)], x_range=[0, 10], **defaults)
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.yaxis.axis_label = "Worker ID"
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Key:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@key</span>
    </div>
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@function</span>
    </div>
    """
    hover.point_policy = "follow_mouse"

    return p
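Similarly, plot_tasks sits behind Profiler; a sketch:

    import dask.array as da
    from dask.diagnostics import Profiler

    x = da.random.random((1000, 1000), chunks=(250, 250))
    with Profiler() as prof:  # records (key, task, start, end, worker id) per task
        x.dot(x.T).sum().compute()
    prof.visualize()          # renders the task timeline via plot_tasks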
Example #15
def visualize(
    profilers, filename="profile.html", show=True, save=None, mode=None, **kwargs
):
    """Visualize the results of profiling in a bokeh plot.

    If multiple profilers are passed in, the plots are stacked vertically.

    Parameters
    ----------
    profilers : profiler or list
        Profiler or list of profilers.
    filename : string, optional
        Name of the plot output file.
    show : boolean, optional
        If True (default), the plot is opened in a browser.
    save : boolean, optional
        If True (default when not in notebook), the plot is saved to disk.
    mode : str, optional
        Mode passed to bokeh.output_file()
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.io import state

    if "file_path" in kwargs:
        warnings.warn(
            "The file_path keyword argument is deprecated "
            "and will be removed in a future release. "
            "Please use filename instead.",
            category=FutureWarning,
            stacklevel=2,
        )
        filename = kwargs.pop("file_path")

    if save is None:
        save = not state.curstate().notebook

    if not isinstance(profilers, list):
        profilers = [profilers]
    figs = [prof._plot(**kwargs) for prof in profilers]
    # Stack the plots
    if len(figs) == 1:
        p = figs[0]
    else:
        top = figs[0]
        for f in figs[1:]:
            f.x_range = top.x_range
            f.title = None
            f.min_border_top = 20
            if BOKEH_VERSION().major < 3:
                f.plot_height -= 30
            else:
                f.height -= 30
        for f in figs[:-1]:
            f.xaxis.axis_label = None
            f.min_border_bottom = 20
            if BOKEH_VERSION().major < 3:
                f.plot_height -= 30
            else:
                f.height -= 30
        for f in figs:
            f.min_border_left = 75
            f.min_border_right = 75
        p = bp.gridplot([[f] for f in figs])
    if show:
        bp.show(p)
    if save:
        bp.output_file(filename, mode=mode)
        bp.save(p)
    return p
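A sketch combining several profilers into one stacked figure with a shared x range:

    import dask.array as da
    from dask.diagnostics import Profiler, ResourceProfiler, visualize

    x = da.random.random((2000, 2000), chunks=(500, 500))
    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof:
        x.dot(x.T).sum().compute()
    visualize([prof, rprof], filename="profile.html", show=False)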
Example #16
def percentile(a, q, method="linear", internal_method="default", **kwargs):
    """Approximate percentile of 1-D array

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    method : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points ``i < j``. Only valid for ``internal_method='dask'``.

        - 'linear': ``i + (j - i) * fraction``, where ``fraction``
          is the fractional part of the index surrounded by ``i``
          and ``j``.
        - 'lower': ``i``.
        - 'higher': ``j``.
        - 'nearest': ``i`` or ``j``, whichever is nearest.
        - 'midpoint': ``(i + j) / 2``.

        .. versionchanged:: 2022.1.0
            This argument was previously called "interpolation"

    internal_method : {'default', 'dask', 'tdigest'}, optional
        What internal method to use. By default will use dask's internal custom
        algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest for
        floats and ints and fallback to the ``'dask'`` otherwise.

        .. versionchanged:: 2022.1.0
            This argument was previously called “method”.

    interpolation : str, optional
        Deprecated name for the method keyword argument.

        .. deprecated:: 2022.1.0

    See Also
    --------
    numpy.percentile : Numpy's equivalent Percentile function
    """
    from .dispatch import percentile_lookup as _percentile
    from .utils import array_safe, meta_from_array

    allowed_internal_methods = ["default", "dask", "tdigest"]

    if method in allowed_internal_methods:
        warnings.warn(
            "In Dask 2022.1.0, the `method=` argument was renamed to `internal_method=`",
            FutureWarning,
        )
        internal_method = method

    if "interpolation" in kwargs:
        warnings.warn(
            "In Dask 2022.1.0, the `interpolation=` argument to percentile was renamed to "
            "`method= ` ",
            FutureWarning,
        )
        method = kwargs.pop("interpolation")

    if kwargs:
        raise TypeError(
            f"percentile() got an unexpected keyword argument {kwargs.keys()}")

    if not a.ndim == 1:
        raise NotImplementedError(
            "Percentiles only implemented for 1-d arrays")
    if isinstance(q, Number):
        q = [q]
    q = array_safe(q, like=meta_from_array(a))
    token = tokenize(a, q, method)

    dtype = a.dtype
    if np.issubdtype(dtype, np.integer):
        dtype = (array_safe([], dtype=dtype, like=meta_from_array(a)) /
                 0.5).dtype
    meta = meta_from_array(a, dtype=dtype)

    if internal_method not in allowed_internal_methods:
        raise ValueError(
            f"`internal_method=` must be one of {allowed_internal_methods}")

    # Allow using t-digest if method is allowed and dtype is of floating or integer type
    if (internal_method == "tdigest" and method == "linear"
            and (np.issubdtype(dtype, np.floating)
                 or np.issubdtype(dtype, np.integer))):

        from dask.utils import import_required

        import_required(
            "crick",
            "crick is a required dependency for using the t-digest method.")

        name = "percentile_tdigest_chunk-" + token
        dsk = {(name, i): (_tdigest_chunk, key)
               for i, key in enumerate(a.__dask_keys__())}

        name2 = "percentile_tdigest-" + token

        dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

    # Otherwise use the custom percentile algorithm
    else:
        # Add 0 and 100 during calculation for more robust behavior (hopefully)
        calc_q = np.pad(q, 1, mode="constant")
        calc_q[-1] = 100
        name = "percentile_chunk-" + token
        dsk = {(name, i): (_percentile, key, calc_q, method)
               for i, key in enumerate(a.__dask_keys__())}

        name2 = "percentile-" + token
        dsk2 = {
            (name2, 0): (
                merge_percentiles,
                q,
                [calc_q] * len(a.chunks[0]),
                sorted(dsk),
                method,
            )
        }

    dsk = merge(dsk, dsk2)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
    return Array(graph, name2, chunks=((len(q), ), ), meta=meta)
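A usage sketch through the public dask.array API (the t-digest path needs the optional crick package):

    import dask.array as da

    x = da.random.random(100_000, chunks=10_000)
    da.percentile(x, [25, 50, 75]).compute()                   # default internal algorithm
    da.percentile(x, 50, internal_method="tdigest").compute()  # uses crick's t-digest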
Example #17
from .threadpoolexecutor import rejoin
from .utils import CancelledError, TimeoutError, sync
from .variable import Variable
from .worker import Reschedule, Worker, get_client, get_worker, secede
from .worker_client import local_client, worker_client

versions = get_versions()
__version__ = versions["version"]
__git_revision__ = versions["full-revisionid"]
del get_versions, versions

if dask.config.get("distributed.admin.event-loop") in ("asyncio", "tornado"):
    pass
elif dask.config.get("distributed.admin.event-loop") == "uvloop":
    import_required(
        "uvloop",
        "The distributed.admin.event-loop configuration value "
        "is set to 'uvloop' but the uvloop module is not installed"
        "\n\n"
        "Please either change the config value or install one of the following\n"
        "    conda install uvloop\n"
        "    pip install uvloop",
    )
    import uvloop

    uvloop.install()
else:
    raise ValueError(
        "Expected distributed.admin.event-loop to be in ('asyncio', 'tornado', 'uvloop'), got %s"
        % dask.config.get("distributed.admin.event-loop"))
Example #18
import os
import re
from functools import partial

from dask.core import get_dependencies, ishashable, istask
from dask.utils import apply, funcname, import_required, key_split

graphviz = import_required(
    "graphviz",
    "Drawing dask graphs requires the `graphviz` python library and the "
    "`graphviz` system library.\n\n"
    "Please either conda or pip install as follows:\n\n"
    "  conda install python-graphviz     # either conda install\n"
    "  python -m pip install graphviz    # or pip install and follow installation instructions",
)


def task_label(task):
    """Label for a task on a dot graph.

    Examples
    --------
    >>> from operator import add
    >>> task_label((add, 1, 2))
    'add'
    >>> task_label((add, (add, 1, 2), 3))
    'add(...)'
    """
    func = task[0]
    if func is apply:
        func = task[1]
Example #19
def to_graphviz(
    hg,
    data_attributes=None,
    function_attributes=None,
    rankdir="BT",
    graph_attr=None,
    node_attr=None,
    edge_attr=None,
    **kwargs,
):
    from dask.dot import label, name

    graphviz = import_required(
        "graphviz",
        "Drawing dask graphs with the graphviz visualization engine requires the `graphviz` "
        "python library and the `graphviz` system library.\n\n"
        "Please either conda or pip install as follows:\n\n"
        "  conda install python-graphviz     # either conda install\n"
        "  python -m pip install graphviz    # or pip install and follow installation instructions",
    )

    data_attributes = data_attributes or {}
    function_attributes = function_attributes or {}
    graph_attr = graph_attr or {}
    node_attr = node_attr or {}
    edge_attr = edge_attr or {}

    graph_attr["rankdir"] = rankdir
    node_attr["shape"] = "box"
    node_attr["fontname"] = "helvetica"

    graph_attr.update(kwargs)
    g = graphviz.Digraph(graph_attr=graph_attr,
                         node_attr=node_attr,
                         edge_attr=edge_attr)

    n_tasks = {}
    for layer in hg.dependencies:
        n_tasks[layer] = len(hg.layers[layer])

    min_tasks = min(n_tasks.values())
    max_tasks = max(n_tasks.values())

    cache = {}

    color = kwargs.get("color")
    if color == "layer_type":
        layer_colors = {
            "DataFrameIOLayer": ["#CCC7F9", False],  # purple
            "ShuffleLayer": ["#F9CCC7", False],  # rose
            "SimpleShuffleLayer": ["#F9CCC7", False],  # rose
            "ArrayOverlayLayer": ["#FFD9F2", False],  # pink
            "BroadcastJoinLayer": ["#D9F2FF", False],  # blue
            "Blockwise": ["#D9FFE6", False],  # green
            "BlockwiseLayer": ["#D9FFE6", False],  # green
            "MaterializedLayer": ["#DBDEE5", False],  # gray
        }

    for layer in hg.dependencies:
        layer_name = name(layer)
        attrs = data_attributes.get(layer, {})

        node_label = label(layer, cache=cache)
        node_size = (20 if max_tasks == min_tasks else
                     int(20 + ((n_tasks[layer] - min_tasks) /
                               (max_tasks - min_tasks)) * 20))

        layer_type = str(type(hg.layers[layer]).__name__)
        node_tooltips = (
            f"A {layer_type.replace('Layer', '')} Layer with {n_tasks[layer]} Tasks.\n"
        )

        layer_ca = hg.layers[layer].collection_annotations
        if layer_ca:
            if layer_ca.get("type") == "dask.array.core.Array":
                node_tooltips += (
                    f"Array Shape: {layer_ca.get('shape')}\n"
                    f"Data Type: {layer_ca.get('dtype')}\n"
                    f"Chunk Size: {layer_ca.get('chunksize')}\n"
                    f"Chunk Type: {layer_ca.get('chunk_type')}\n")

            if layer_ca.get("type") == "dask.dataframe.core.DataFrame":
                dftype = {"pandas.core.frame.DataFrame": "pandas"}
                cols = layer_ca.get("columns")

                node_tooltips += (
                    f"Number of Partitions: {layer_ca.get('npartitions')}\n"
                    f"DataFrame Type: {dftype.get(layer_ca.get('dataframe_type'))}\n"
                    f"{len(cols)} DataFrame Columns: {str(cols) if len(str(cols)) <= 40 else '[...]'}\n"
                )

        attrs.setdefault("label", str(node_label))
        attrs.setdefault("fontsize", str(node_size))
        attrs.setdefault("tooltip", str(node_tooltips))

        if color == "layer_type":
            node_color = layer_colors.get(layer_type)[0]
            layer_colors.get(layer_type)[1] = True

            attrs.setdefault("fillcolor", str(node_color))
            attrs.setdefault("style", "filled")

        g.node(layer_name, **attrs)

    for layer, deps in hg.dependencies.items():
        layer_name = name(layer)
        for dep in deps:
            dep_name = name(dep)
            g.edge(dep_name, layer_name)

    if color == "layer_type":
        legend_title = "Key"

        legend_label = (
            '<<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0" CELLPADDING="5">'
            "<TR><TD><B>Legend: Layer types</B></TD></TR>")

        for layer_type, color in layer_colors.items():
            if color[1]:
                legend_label += f'<TR><TD BGCOLOR="{color[0]}">{layer_type}</TD></TR>'

        legend_label += "</TABLE>>"

        attrs = data_attributes.get(legend_title, {})
        attrs.setdefault("label", str(legend_label))
        attrs.setdefault("fontsize", "20")
        attrs.setdefault("margin", "0")

        g.node(legend_title, **attrs)

    return g
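A sketch on a high-level graph taken from a small array computation (requires graphviz; passing color="layer_type" additionally colors nodes and adds the legend, provided every layer type appears in the color table above):

    import dask.array as da

    x = (da.ones((10, 10), chunks=(5, 5)) + 1).sum()
    hg = x.__dask_graph__()   # a HighLevelGraph
    g = to_graphviz(hg)       # graphviz.Digraph with one node per layer
    g.render("layers", format="svg")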
Example #20
def percentile(a, q, interpolation='linear', method='default'):
    """ Approximate percentile of 1-D array

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points ``i < j``. Only valid for ``method='dask'``.

        * 'linear': ``i + (j - i) * fraction``, where ``fraction``
        is the fractional part of the index surrounded by ``i``
        and ``j``.
        * 'lower': ``i``.
        * 'higher': ``j``.
        * 'nearest': ``i`` or ``j``, whichever is nearest.
        * 'midpoint': ``(i + j) / 2``.

    method : {'default', 'dask', 'tdigest'}, optional
        What method to use. By default will use dask's internal custom
        algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest for
        floats and ints and fallback to the ``'dask'`` otherwise.

    See Also
    --------
    numpy.percentile : Numpy's equivalent Percentile function
    """
    if not a.ndim == 1:
        raise NotImplementedError(
            "Percentiles only implemented for 1-d arrays")
    if isinstance(q, Number):
        q = [q]
    q = np.array(q)
    token = tokenize(a, q, interpolation)

    dtype = a.dtype
    if np.issubdtype(dtype, np.integer):
        dtype = (np.array([], dtype=dtype) / 0.5).dtype

    allowed_methods = ['default', 'dask', 'tdigest']
    if method not in allowed_methods:
        raise ValueError("method can only be 'default', 'dask' or 'tdigest'")

    if method == 'default':
        internal_method = 'dask'
    else:
        internal_method = method

    # Allow using t-digest if interpolation is allowed and dtype is of floating or integer type
    if (internal_method == 'tdigest' and interpolation == 'linear' and
            (np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer))):

        from dask.utils import import_required
        import_required('crick',
                        'crick is a required dependency for using the t-digest '
                        'method.')

        name = 'percentile_tdigest_chunk-' + token
        dsk = dict(((name, i), (_tdigest_chunk, (key)))
                   for i, key in enumerate(a.__dask_keys__()))

        name2 = 'percentile_tdigest-' + token

        dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

    # Otherwise use the custom percentile algorithm
    else:

        name = 'percentile_chunk-' + token
        dsk = dict(((name, i), (_percentile, (key), q, interpolation))
                   for i, key in enumerate(a.__dask_keys__()))

        name2 = 'percentile-' + token
        dsk2 = {(name2, 0): (merge_percentiles, q, [q] * len(a.chunks[0]),
                             sorted(dsk), interpolation)}

    dsk = merge(dsk, dsk2)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
    return Array(graph, name2, chunks=((len(q),),), dtype=dtype)
Example #21
def percentile(a, q, interpolation="linear", method="default"):
    """Approximate percentile of 1-D array

    Parameters
    ----------
    a : Array
    q : array_like of float
        Percentile or sequence of percentiles to compute, which must be between
        0 and 100 inclusive.
    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}, optional
        The interpolation method to use when the desired percentile lies
        between two data points ``i < j``. Only valid for ``method='dask'``.

        - 'linear': ``i + (j - i) * fraction``, where ``fraction``
          is the fractional part of the index surrounded by ``i``
          and ``j``.
        - 'lower': ``i``.
        - 'higher': ``j``.
        - 'nearest': ``i`` or ``j``, whichever is nearest.
        - 'midpoint': ``(i + j) / 2``.

    method : {'default', 'dask', 'tdigest'}, optional
        What method to use. By default will use dask's internal custom
        algorithm (``'dask'``).  If set to ``'tdigest'`` will use tdigest for
        floats and ints and fallback to the ``'dask'`` otherwise.

    See Also
    --------
    numpy.percentile : Numpy's equivalent Percentile function
    """
    if not a.ndim == 1:
        raise NotImplementedError(
            "Percentiles only implemented for 1-d arrays")
    if isinstance(q, Number):
        q = [q]
    q = np.array(q)
    token = tokenize(a, q, interpolation)

    dtype = a.dtype
    if np.issubdtype(dtype, np.integer):
        dtype = (np.array([], dtype=dtype) / 0.5).dtype

    allowed_methods = ["default", "dask", "tdigest"]
    if method not in allowed_methods:
        raise ValueError("method can only be 'default', 'dask' or 'tdigest'")

    if method == "default":
        internal_method = "dask"
    else:
        internal_method = method

    # Allow using t-digest if interpolation is allowed and dtype is of floating or integer type
    if (internal_method == "tdigest" and interpolation == "linear"
            and (np.issubdtype(dtype, np.floating)
                 or np.issubdtype(dtype, np.integer))):

        from dask.utils import import_required

        import_required(
            "crick",
            "crick is a required dependency for using the t-digest method.")

        name = "percentile_tdigest_chunk-" + token
        dsk = dict(((name, i), (_tdigest_chunk, key))
                   for i, key in enumerate(a.__dask_keys__()))

        name2 = "percentile_tdigest-" + token

        dsk2 = {(name2, 0): (_percentiles_from_tdigest, q, sorted(dsk))}

    # Otherwise use the custom percentile algorithm
    else:
        # Add 0 and 100 during calculation for more robust behavior (hopefully)
        calc_q = np.pad(q, 1, mode="constant")
        calc_q[-1] = 100
        name = "percentile_chunk-" + token
        dsk = dict(((name, i), (_percentile, key, calc_q, interpolation))
                   for i, key in enumerate(a.__dask_keys__()))

        name2 = "percentile-" + token
        dsk2 = {
            (name2, 0): (
                merge_percentiles,
                q,
                [calc_q] * len(a.chunks[0]),
                sorted(dsk),
                interpolation,
            )
        }

    dsk = merge(dsk, dsk2)
    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[a])
    return Array(graph, name2, chunks=((len(q), ), ), dtype=dtype)
Example #22
def to_avro(b, filename, schema, name_function=None, storage_options=None,
            codec='null', sync_interval=16000, metadata=None, compute=True,
            **kwargs):
    """Write bag to set of avro files

    The schema is a complex dictionary describing the data, see
    https://avro.apache.org/docs/1.8.2/gettingstartedpython.html#Defining+a+schema
    and https://fastavro.readthedocs.io/en/latest/writer.html .
    Its structure is as follows::

        {'name': 'Test',
         'namespace': 'Test',
         'doc': 'Descriptive text',
         'type': 'record',
         'fields': [
            {'name': 'a', 'type': 'int'},
         ]}

    where the "name" field is required, but "namespace" and "doc" are optional
    descriptors; "type" must always be "record". The list of fields should
    have an entry for every key of the input records, and the types are
    like the primitive, complex or logical types of the Avro spec
    ( https://avro.apache.org/docs/1.8.2/spec.html ).

    Results in one avro file per input partition.

    Parameters
    ----------
    b: dask.bag.Bag
    filename: list of str or str
        Filenames to write to. If a list, number must match the number of
        partitions. If a string, must include a glob character "*", which will
        be expanded using name_function
    schema: dict
        Avro schema dictionary, see above
    name_function: None or callable
        Expands integers into strings, see
        ``dask.bytes.utils.build_name_function``
    storage_options: None or dict
        Extra key/value options to pass to the backend file-system
    codec: 'null', 'deflate', or 'snappy'
        Compression algorithm
    sync_interval: int
        Number of records to include in each block within a file
    metadata: None or dict
        Included in the file header
    compute: bool
        If True, files are written immediately, and function blocks. If False,
        returns delayed objects, which can be computed by the user where
        convenient.
    kwargs: passed to compute(), if compute=True

    Examples
    --------
    >>> import dask.bag as db
    >>> b = db.from_sequence([{'name': 'Alice', 'value': 100},
    ...                       {'name': 'Bob', 'value': 200}])
    >>> schema = {'name': 'People', 'doc': "Set of people's scores",
    ...           'type': 'record',
    ...           'fields': [
    ...               {'name': 'name', 'type': 'string'},
    ...               {'name': 'value', 'type': 'int'}]}
    >>> b.to_avro('my-data.*.avro', schema)  # doctest: +SKIP
    ['my-data.0.avro', 'my-data.1.avro']
    """
    # TODO infer schema from first partition of data
    from dask.utils import import_required
    from dask.bytes.core import open_files
    import_required('fastavro',
                    "fastavro is a required dependency for using "
                    "bag.to_avro().")
    _verify_schema(schema)

    storage_options = storage_options or {}
    files = open_files(filename, 'wb', name_function=name_function,
                       num=b.npartitions, **storage_options)
    name = 'to-avro-' + uuid.uuid4().hex
    dsk = {(name, i): (_write_avro_part, (b.name, i), f, schema, codec,
                       sync_interval, metadata)
           for i, f in enumerate(files)}
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[b])
    out = type(b)(graph, name, b.npartitions)
    if compute:
        out.compute(**kwargs)
        return [f.path for f in files]
    else:
        return out.to_delayed()