Example #1
def get(dsk, result, cache=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------

    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    nthreads: int (optional)
        Number of threads to use in the ThreadPool that actually executes the tasks
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------

    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    pool = _globals['pool']

    if pool is None:
        pool = default_pool

    queue = Queue()
    results = get_async(pool.apply_async, len(pool._pool), dsk, result,
                        cache=cache, queue=queue, get_id=_thread_get_id,
                        **kwargs)

    return results
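
The snippet above reads the shared pool from `_globals['pool']` and falls back to a module-level `default_pool`. In the dask versions these excerpts appear to come from, callers typically injected their own pool through `dask.set_options`; the following is a minimal sketch assuming that older API (the `pool` option and the `dask.threaded.get` entry point), not a guaranteed interface:

from multiprocessing.pool import ThreadPool

import dask
from dask.threaded import get

# Toy graph; the lambda stays in-process, so no pickling is involved.
dsk = {'x': 1, 'y': (lambda v: v + 1, 'x')}

# Sketch only: assumes the older dask.set_options API, which stores the pool
# in dask.context._globals -- exactly what the get() above reads back.
with dask.set_options(pool=ThreadPool(4)):
    print(get(dsk, 'y'))   # -> 2
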
Example #2
def get(dsk, keys, optimizations=[fuse], num_workers=cpu_count):
    """ Multiprocessed get function appropriate for Bags """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(psutil.cpu_count())
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = dill_apply_async(pool.apply_async)

    # Optimize Dask
    dsk2 = pipe(dsk, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, cpu_count, dsk2, keys,
                           queue=queue)
    finally:
        if cleanup:
            pool.close()
    return result
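
The `pipe(dsk, partial(cull, keys=keys), *optimizations)` line trims the graph down to the tasks actually needed for `keys` before anything is shipped to workers (the default `optimizations=[fuse]` then merges linear chains). A self-contained sketch of the culling idea, using a hand-rolled `cull` and an illustrative `inc` function rather than dask's own implementation:

def cull(dsk, keys):
    """Keep only the tasks needed to compute ``keys`` (flat tasks only)."""
    keys = keys if isinstance(keys, list) else [keys]
    out = {}
    stack = list(keys)
    while stack:
        key = stack.pop()
        if key in out:
            continue
        task = dsk[key]
        out[key] = task
        if isinstance(task, tuple):          # (func, arg1, arg2, ...) form
            stack.extend(arg for arg in task[1:] if arg in dsk)
    return out


def inc(x):
    return x + 1


dsk = {'a': 1, 'b': (inc, 'a'), 'dead_end': (inc, 'b'), 'c': 2}
print(sorted(cull(dsk, 'b')))   # ['a', 'b'] -- 'dead_end' and 'c' are dropped
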
Example #3
def get(dsk,
        keys,
        num_workers=None,
        func_loads=None,
        func_dumps=None,
        optimize_graph=True,
        **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    try:
        # Run
        result = get_async(apply_async,
                           len(pool._pool),
                           dsk3,
                           keys,
                           queue=queue,
                           get_id=_process_get_id,
                           **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
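
Because this scheduler runs tasks in worker processes, everything in the graph has to be serializable. A hedged usage sketch, assuming the `get` above is the one exposed as `dask.multiprocessing.get` and using `operator` functions so every task is trivially picklable:

from operator import add, mul

from dask.multiprocessing import get

dsk = {'x': 1, 'y': 2, 'z': (mul, 'x', 10), 'w': (add, 'z', 'y')}

if __name__ == '__main__':   # guard needed where the 'spawn' start method is the default
    print(get(dsk, 'w', num_workers=2))          # -> 12
    print(get(dsk, ['w', 'y'], num_workers=2))   # -> (12, 2)
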
Example #4
def get(dsk,
        keys,
        optimizations=[],
        num_workers=None,
        func_loads=None,
        func_dumps=None,
        **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        Optimizations to apply to the graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2 = fuse(dsk, keys)
    dsk3 = pipe(dsk2, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async,
                           len(pool._pool),
                           dsk3,
                           keys,
                           queue=queue,
                           get_id=_process_get_id,
                           **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
Example #5
def get(dsk, result, cache=None, num_workers=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------

    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    num_workers: int (optional)
        Number of threads to use in the ThreadPool that actually executes the tasks
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------

    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    global default_pool
    pool = _globals['pool']
    thread = current_thread()

    with pools_lock:
        if pool is None:
            if num_workers is None and thread is main_thread:
                if default_pool is None:
                    default_pool = ThreadPool()
                pool = default_pool
            elif thread in pools and num_workers in pools[thread]:
                pool = pools[thread][num_workers]
            else:
                pool = ThreadPool(num_workers)
                pools[thread][num_workers] = pool

    results = get_async(pool.apply_async,
                        len(pool._pool),
                        dsk,
                        result,
                        cache=cache,
                        get_id=_thread_get_id,
                        **kwargs)

    # Cleanup pools associated to dead threads
    with pools_lock:
        active_threads = set(threading.enumerate())
        if thread is not main_thread:
            for t in list(pools):
                if t not in active_threads:
                    for p in pools.pop(t).values():
                        p.close()

    return results
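
This variant keeps one long-lived default pool for the main thread and a per-thread cache of sized pools for everyone else. The excerpt relies on several module-level names that are not shown (`pools`, `pools_lock`, `default_pool`, `main_thread`); the following is a plausible sketch of that state, labelled as an assumption rather than the verbatim dask source:

import threading
from collections import defaultdict
from multiprocessing.pool import ThreadPool

# Assumed module-level state for the excerpt above (a sketch, not verbatim):
# one lazily created default ThreadPool for the main thread, plus a
# lock-protected {thread: {num_workers: ThreadPool}} cache for other threads.
main_thread = threading.main_thread()
default_pool = None                # created lazily inside get()
pools = defaultdict(dict)          # thread -> {num_workers: ThreadPool}
pools_lock = threading.Lock()
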
Example #6
def get(dsk,
        result,
        nthreads=NUM_CPUS,
        cache=None,
        debug_counts=None,
        **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------

    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    nthreads: int (optional)
        Number of threads to use in the ThreadPool that actually executes the tasks
    cache: dict-like (optional)
        Temporary storage of results
    debug_counts: integer or None
        This integer tells how often the scheduler should dump debugging info

    Examples
    --------

    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    pool = _globals['pool']

    if pool is None:
        pool = ThreadPool(nthreads)
        cleanup = True
    else:
        cleanup = False

    queue = Queue()
    try:
        results = get_async(pool.apply_async,
                            nthreads,
                            dsk,
                            result,
                            cache=cache,
                            debug_counts=debug_counts,
                            queue=queue,
                            **kwargs)
    finally:
        if cleanup:
            pool.close()
            pool.join()

    return results
Example #7
def get(dsk, result, cache=None, num_workers=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------

    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    num_workers: int (optional)
        Number of threads to use in the ThreadPool that actually executes the tasks
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------

    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    global default_pool
    pool = _globals['pool']
    thread = current_thread()

    with pools_lock:
        if pool is None:
            if num_workers is None and thread is main_thread:
                if default_pool is None:
                    default_pool = ThreadPool()
                pool = default_pool
            elif thread in pools and num_workers in pools[thread]:
                pool = pools[thread][num_workers]
            else:
                pool = ThreadPool(num_workers)
                pools[thread][num_workers] = pool

    results = get_async(pool.apply_async, len(pool._pool), dsk, result,
                        cache=cache, get_id=_thread_get_id,
                        **kwargs)

    # Cleanup pools associated to dead threads
    with pools_lock:
        active_threads = set(threading.enumerate())
        if thread is not main_thread:
            for t in list(pools):
                if t not in active_threads:
                    for p in pools.pop(t).values():
                        p.close()

    return results
Example #8
def get(dsk, keys, num_workers=None, func_loads=None, func_dumps=None,
        optimize_graph=True, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers,
                                    initializer=initialize_worker_process)
        cleanup = True
    else:
        cleanup = False

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    # We specify marshalling functions in order to catch serialization
    # errors and report them to the user.
    loads = func_loads or _globals.get('func_loads') or _loads
    dumps = func_dumps or _globals.get('func_dumps') or _dumps

    # Note former versions used a multiprocessing Manager to share
    # a Queue between parent and workers, but this is fragile on Windows
    # (issue #1652).
    try:
        # Run
        result = get_async(pool.apply_async, len(pool._pool), dsk3, keys,
                           get_id=_process_get_id,
                           dumps=dumps, loads=loads, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
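
Here the serialization hooks default to module-level `_dumps`/`_loads` helpers so that pickling failures surface as a readable error instead of a hung pool. A hedged sketch of what such wrappers might look like, built on cloudpickle; the bodies below are an assumption matching the excerpt's intent, not dask's exact code:

import cloudpickle


def _dumps(obj):
    # Wrap serialization so the offending object is named in the error.
    try:
        return cloudpickle.dumps(obj)
    except Exception as exc:
        raise ValueError(
            "Could not serialize object for the multiprocessing scheduler: %r"
            % (obj,)
        ) from exc


def _loads(data):
    return cloudpickle.loads(data)
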
Example #9
def get(dsk, keys, num_workers=None, func_loads=None, func_dumps=None,
        optimize_graph=True, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
Example #10
def get(dsk, keys, optimizations=[], num_workers=None,
        func_loads=None, func_dumps=None, **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------

    dsk: dict
        dask graph
    keys: object or list
        Desired results from graph
    optimizations: list of functions
        Optimizations to apply to the graph before execution
    num_workers: int
        Number of worker processes (defaults to number of cores)
    func_dumps: function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads: function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers)
        cleanup = True
    else:
        cleanup = False

    manager = multiprocessing.Manager()
    queue = manager.Queue()

    apply_async = pickle_apply_async(pool.apply_async,
                                     func_dumps=func_dumps,
                                     func_loads=func_loads)

    # Optimize Dask
    dsk2 = fuse(dsk, keys)
    dsk3 = pipe(dsk2, partial(cull, keys=keys), *optimizations)

    try:
        # Run
        result = get_async(apply_async, len(pool._pool), dsk3, keys,
                           queue=queue, get_id=_process_get_id, **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
Example #11
def get(dsk, result, cache=None, **kwargs):
    """ Threaded cached implementation of dask.get

    Parameters
    ----------

    dsk: dict
        A dask dictionary specifying a workflow
    result: key or list of keys
        Keys corresponding to desired data
    nthreads: int (optional)
        Number of threads to use in the ThreadPool that actually executes the tasks
    cache: dict-like (optional)
        Temporary storage of results

    Examples
    --------

    >>> dsk = {'x': 1, 'y': 2, 'z': (inc, 'x'), 'w': (add, 'z', 'y')}
    >>> get(dsk, 'w')
    4
    >>> get(dsk, ['w', 'y'])
    (4, 2)
    """
    pool = _globals['pool']

    if pool is None:
        pool = default_pool

    queue = Queue()
    results = get_async(pool.apply_async,
                        len(pool._pool),
                        dsk,
                        result,
                        cache=cache,
                        queue=queue,
                        get_id=_thread_get_id,
                        **kwargs)

    return results
Example #12
def get(dsk,
        keys,
        num_workers=None,
        func_loads=None,
        func_dumps=None,
        optimize_graph=True,
        **kwargs):
    """ Multiprocessed get function appropriate for Bags

    Parameters
    ----------
    dsk : dict
        dask graph
    keys : object or list
        Desired results from graph
    num_workers : int
        Number of worker processes (defaults to number of cores)
    func_dumps : function
        Function to use for function serialization
        (defaults to cloudpickle.dumps)
    func_loads : function
        Function to use for function deserialization
        (defaults to cloudpickle.loads)
    optimize_graph : bool
        If True [default], `fuse` is applied to the graph before computation.
    """
    pool = _globals['pool']
    if pool is None:
        pool = multiprocessing.Pool(num_workers,
                                    initializer=initialize_worker_process)
        cleanup = True
    else:
        cleanup = False

    # Optimize Dask
    dsk2, dependencies = cull(dsk, keys)
    if optimize_graph:
        dsk3, dependencies = fuse(dsk2, keys, dependencies)
    else:
        dsk3 = dsk2

    # We specify marshalling functions in order to catch serialization
    # errors and report them to the user.
    loads = func_loads or _globals.get('func_loads') or _loads
    dumps = func_dumps or _globals.get('func_dumps') or _dumps

    # Note former versions used a multiprocessing Manager to share
    # a Queue between parent and workers, but this is fragile on Windows
    # (issue #1652).
    try:
        # Run
        result = get_async(pool.apply_async,
                           len(pool._pool),
                           dsk3,
                           keys,
                           get_id=_process_get_id,
                           dumps=dumps,
                           loads=loads,
                           **kwargs)
    finally:
        if cleanup:
            pool.close()
    return result
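
Examples #8 and #12 start the pool with `initializer=initialize_worker_process` so each child begins from a known state. A minimal sketch of what such an initializer can do; re-seeding the random module per worker is the usual motivation, but this is an assumption about its purpose, not the verbatim dask function:

import os
import random


def initialize_worker_process():
    # Give each worker its own seed so forked children do not share
    # an identical "random" stream inherited from the parent.
    random.seed(os.urandom(16))

# Usage sketch: multiprocessing.Pool(num_workers, initializer=initialize_worker_process)
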