import toolz


def apply_conjunction(parts, statistics, conjunction):
    for column, operator, value in conjunction:
        out_parts = []
        out_statistics = []
        for part, stats in zip(parts, statistics):
            if "filter" in stats and stats["filter"]:
                continue  # Filtered by engine
            try:
                c = toolz.groupby("name", stats["columns"])[column][0]
                minimum = c["min"]
                maximum = c["max"]
            except KeyError:
                # No statistics for this column; keep the part.
                out_parts.append(part)
                out_statistics.append(stats)
            else:
                # Keep the part only if its [min, max] range can satisfy
                # the predicate.
                if (
                    (operator == "==" and minimum <= value <= maximum)
                    or (operator == "<" and minimum < value)
                    or (operator == "<=" and minimum <= value)
                    or (operator == ">" and maximum > value)
                    or (operator == ">=" and maximum >= value)
                    or (operator == "in"
                        and any(minimum <= item <= maximum for item in value))
                ):
                    out_parts.append(part)
                    out_statistics.append(stats)

        parts, statistics = out_parts, out_statistics

    return parts, statistics
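A minimal usage sketch, assuming parquet-style row-group statistics; the part names, column names, and ranges below are illustrative:

parts = ["part-0", "part-1"]
statistics = [
    {"columns": [{"name": "x", "min": 0, "max": 10}]},
    {"columns": [{"name": "x", "min": 50, "max": 60}]},
]
# Keep only parts whose statistics can satisfy x > 40.
parts, statistics = apply_conjunction(parts, statistics, [("x", ">", 40)])
print(parts)  # ['part-1']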
Example #2
def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from .highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt in optimizations:
                dsk = opt(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
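A hedged usage sketch: in dask this helper lives at dask.base.collections_to_dsk and is what compute() calls under the hood; the bag below is just an illustrative collection.

import dask.bag as db
from dask.base import collections_to_dsk

b = db.range(4, npartitions=2)
dsk = collections_to_dsk([b])  # single optimized graph for the collection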
Example #3
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func,
                                            shuffle="tasks",
                                            max_branch=2)
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(20))
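For reference, the right-hand side of that assertion is plain toolz.groupby over the same range:

from toolz import groupby

print(groupby(lambda x: x % 10, range(20)))
# {0: [0, 10], 1: [1, 11], ..., 9: [9, 19]}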
Example #4
def collections_to_dsk(collections, optimize_graph=True, **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    optimizations = kwargs.pop("optimizations", None) or config.get("optimizations", [])

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        _opt_list = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            _opt = opt(dsk, keys, **kwargs)
            groups[opt] = (_opt, keys)
            _opt_list.append(_opt)

        for opt in optimizations:
            _opt_list = []
            group = {}
            for k, (dsk, keys) in groups.items():
                _opt = opt(dsk, keys, **kwargs)
                group[k] = (_opt, keys)
                _opt_list.append(_opt)
            groups = group

        dsk = merge(*map(ensure_dict, _opt_list))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
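Note the control flow: the first loop seeds groups with each optimizer's output, then every extra pass in optimizations rebuilds _opt_list from groups, so the passes chain and only the final pass's graphs reach the merge.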
Example #5
import toolz as tlz


def handle_funcs(group):
    groups = tlz.groupby('group', group)
    for name in sorted(groups):
        yield ''
        yield f'    # {name}'
        for info in groups[name]:
            yield f'    {info["ctext_t"]}'
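A hedged usage sketch; the record fields ("group" and "ctext_t") are the ones handle_funcs reads, with illustrative values:

group = [
    {"group": "math", "ctext_t": "double sin(double x);"},
    {"group": "io", "ctext_t": "int puts(const char *s);"},
    {"group": "math", "ctext_t": "double cos(double x);"},
]
print("\n".join(handle_funcs(group)))
#
#     # io
#     int puts(const char *s);
#
#     # math
#     double sin(double x);
#     double cos(double x);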
Example #6
async def scatter_to_workers(nthreads,
                             data,
                             rpc=rpc,
                             report=True,
                             serializers=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have.  nthreads should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    assert isinstance(nthreads, dict)
    assert isinstance(data, dict)

    workers = list(concat([w] * nc for w, nc in nthreads.items()))
    names, data = list(zip(*data.items()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {
        worker: {key: value
                 for _, key, value in v}
        for worker, v in d.items()
    }

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = await All([
            rpcs[address].update_data(data=v,
                                      report=report,
                                      serializers=serializers)
            for address, v in d.items()
        ])
    finally:
        for r in rpcs.values():
            await r.close_rpc()

    nbytes = merge(o["nbytes"] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    return (names, who_has, nbytes)
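A standalone sketch of just the round-robin assignment logic, with illustrative worker addresses and no RPC (the module-level rotating counter is dropped for brevity):

from itertools import cycle
from toolz import concat, groupby

nthreads = {"tcp://w1:1234": 2, "tcp://w2:1234": 1}
data = {"x": 1, "y": 2, "z": 3}

# One slot per core, cycled round-robin over the data items.
workers = list(concat([w] * nc for w, nc in nthreads.items()))
L = list(zip(cycle(workers), data.keys(), data.values()))
assignment = {w: {k: v for _, k, v in grp} for w, grp in groupby(0, L).items()}
print(assignment)
# {'tcp://w1:1234': {'x': 1, 'y': 2}, 'tcp://w2:1234': {'z': 3}}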
Example #7
def broadcast_dimensions(argpairs,
                         numblocks,
                         sentinels=(1, (1, )),
                         consolidate=None):
    """Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([
        zip(inds, dims) for (x, inds), (_, dims) in toolz.join(
            toolz.first, argpairs2, toolz.first, numblocks.items())
    ])

    g = toolz.groupby(0, L)
    g = {k: {d for _, d in v} for k, v in g.items()}

    g2 = {k: (v - set(sentinels) if len(v) > 1 else v) for k, v in g.items()}

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == {1}:
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
Example #8
File: base.py Project: z7ye/dask-1
def collections_to_dsk(collections, optimize_graph=True, **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from .highlevelgraph import HighLevelGraph

    optimizations = kwargs.pop("optimizations", None) or config.get(
        "optimizations", [])

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        _opt_list = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            _opt = opt(dsk, keys, **kwargs)
            # Store the optimized graph so any extra passes below build on
            # it; storing the raw graph would silently drop this pass.
            groups[opt] = (_opt, keys)
            _opt_list.append(_opt)

        for opt in optimizations:
            _opt_list = []
            group = {}
            for k, (dsk, keys) in groups.items():
                _opt = opt(dsk, keys, **kwargs)
                group[k] = (_opt, keys)
                _opt_list.append(_opt)
            groups = group

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in _opt_list):
            dsk = HighLevelGraph.merge(*_opt_list)
        else:
            dsk = merge(*map(ensure_dict, _opt_list))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
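A hedged sketch of the config hook this function reads: extra passes can be supplied through dask's "optimizations" config key; the pass below is an illustrative no-op.

import dask
import dask.bag as db

def noop_pass(dsk, keys, **kwargs):
    # An optimization pass receives a graph plus its output keys and
    # must return a (possibly rewritten) graph; this one does nothing.
    return dsk

b = db.range(4, npartitions=2)
with dask.config.set(optimizations=[noop_pass]):
    b.compute()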
Example #9
def plot_cache(results,
               dsk,
               start_time,
               metric_name,
               palette="Viridis",
               label_size=60,
               **kwargs):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of CacheProfiler.results
    dsk : dict
        The dask graph being profiled.
    start_time : float
        Start time of the profile.
    metric_name : string
        Metric used to measure cache size
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size : int, optional
        Maximum size of output labels in plot, defaults to 60.
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,wheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update(
        (k, v) for (k, v) in kwargs.items() if k in _get_figure_keywords())

    if results:
        starts, ends = list(zip(*results))[3:]
        tics = sorted(unique(starts + ends))
        groups = groupby(lambda d: pprint_task(d[1], dsk, label_size), results)
        data = {}
        for k, vals in groups.items():
            cnts = dict.fromkeys(tics, 0)
            for v in vals:
                cnts[v.cache_time] += v.metric
                cnts[v.free_time] -= v.metric
            data[k] = [0] + list(
                accumulate(add, pluck(1, sorted(cnts.items()))))

        tics = [0] + [i - start_time for i in tics]
        p = bp.figure(x_range=[0, max(tics)], **defaults)

        for (key, val), color in zip(data.items(),
                                     get_colors(palette, data.keys())):
            p.line(
                "x",
                "y",
                line_color=color,
                line_width=3,
                source=bp.ColumnDataSource({
                    "x": tics,
                    "y": val,
                    "label": [key for i in val]
                }),
            )

    else:
        p = bp.figure(y_range=[0, 10], x_range=[0, 10], **defaults)
    p.yaxis.axis_label = "Cache Size ({0})".format(metric_name)
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@label</span>
    </div>
    """
    return p
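The cache-size series above is a running sum over (tic, net metric change) events; a standalone sketch of that accumulate/pluck computation with illustrative timings:

from operator import add
from toolz import accumulate, pluck

cnts = {0.0: 0, 1.0: 0, 2.0: 0, 3.0: 0}  # tic -> net metric change
cnts[0.0] += 8  # a task's result enters the cache at t=0
cnts[2.0] -= 8  # and is freed at t=2
series = [0] + list(accumulate(add, pluck(1, sorted(cnts.items()))))
print(series)  # [0, 8, 8, 0, 0]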
Example #10
def plot_tasks(results, dsk, palette="Viridis", label_size=60, **kwargs):
    """Visualize the results of profiling in a bokeh plot.

    Parameters
    ----------
    results : sequence
        Output of Profiler.results
    dsk : dict
        The dask graph being profiled.
    palette : string, optional
        Name of the bokeh palette to use, must be a member of
        bokeh.palettes.all_palettes.
    label_size : int, optional
        Maximum size of output labels in plot, defaults to 60.
    **kwargs
        Other keyword arguments, passed to bokeh.figure. These will override
        all defaults set by visualize.

    Returns
    -------
    The completed bokeh plot object.
    """
    bp = import_required("bokeh.plotting", _BOKEH_MISSING_MSG)
    from bokeh.models import HoverTool

    defaults = dict(
        title="Profile Results",
        tools="hover,save,reset,xwheel_zoom,xpan",
        toolbar_location="above",
        width=800,
        height=300,
    )
    # Support plot_width and plot_height for backwards compatibility
    if "plot_width" in kwargs:
        kwargs["width"] = kwargs.pop("plot_width")
    if "plot_height" in kwargs:
        kwargs["height"] = kwargs.pop("plot_height")
    defaults.update(
        (k, v) for (k, v) in kwargs.items() if k in _get_figure_keywords())

    if results:
        keys, tasks, starts, ends, ids = zip(*results)

        id_group = groupby(itemgetter(4), results)
        timings = dict((k, [i.end_time - i.start_time for i in v])
                       for (k, v) in id_group.items())
        id_lk = dict((t[0], n) for (n, t) in enumerate(
            sorted(timings.items(), key=itemgetter(1), reverse=True)))

        left = min(starts)
        right = max(ends)

        p = bp.figure(y_range=[str(i) for i in range(len(id_lk))],
                      x_range=[0, right - left],
                      **defaults)

        data = {}
        data["width"] = width = [e - s for (s, e) in zip(starts, ends)]
        data["x"] = [w / 2 + s - left for (w, s) in zip(width, starts)]
        data["y"] = [id_lk[i] + 1 for i in ids]
        data["function"] = funcs = [
            pprint_task(i, dsk, label_size) for i in tasks
        ]
        data["color"] = get_colors(palette, funcs)
        data["key"] = [str(i) for i in keys]

        source = bp.ColumnDataSource(data=data)

        p.rect(
            source=source,
            x="x",
            y="y",
            height=1,
            width="width",
            color="color",
            line_color="gray",
        )
    else:
        p = bp.figure(y_range=[str(i) for i in range(8)],
                      x_range=[0, 10],
                      **defaults)
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.yaxis.axis_label = "Worker ID"
    p.xaxis.axis_label = "Time (s)"

    hover = p.select(HoverTool)
    hover.tooltips = """
    <div>
        <span style="font-size: 14px; font-weight: bold;">Key:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@key</span>
    </div>
    <div>
        <span style="font-size: 14px; font-weight: bold;">Task:</span>&nbsp;
        <span style="font-size: 10px; font-family: Monaco, monospace;">@function</span>
    </div>
    """
    hover.point_policy = "follow_mouse"

    return p
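The y-axis ordering comes from ranking worker ids by their task-duration lists; a standalone sketch with illustrative timings (the lists compare lexicographically, exactly as in the code above):

from operator import itemgetter

timings = {"w-a": [0.5, 0.2], "w-b": [3.0], "w-c": [1.0, 1.0]}
id_lk = dict((t[0], n) for (n, t) in enumerate(
    sorted(timings.items(), key=itemgetter(1), reverse=True)))
print(id_lk)  # {'w-b': 0, 'w-c': 1, 'w-a': 2}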
Example #11
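In the dask test suite this function carries a pytest parametrize decorator supplying size, npartitions, and groups; the values here are illustrative stand-ins, not the originals:

import pytest

@pytest.mark.parametrize("size,npartitions,groups",
                         [(100, 5, 10), (20, 2, 5)])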
def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, shuffle="tasks")
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(size))