def test_get_scheduler():
    """Check how ``get_scheduler`` resolves names and the config override."""
    # Nothing requested and nothing configured.
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    with dask.config.set(scheduler='threads'):
        # An explicit argument resolves the same way under the override.
        assert get_scheduler(scheduler='threads') is dask.threaded.get
        # With no argument the configured scheduler must be picked up;
        # without this assertion the override itself is never exercised.
        assert get_scheduler() is dask.threaded.get
    # The override must not leak out of the ``with`` block.
    assert get_scheduler() is None
def test_get_scheduler():
    """Check how ``get_scheduler`` resolves names and the config override."""
    # Nothing requested and nothing configured.
    assert get_scheduler() is None
    assert get_scheduler(scheduler="threads") is dask.threaded.get
    assert get_scheduler(scheduler="sync") is dask.local.get_sync
    with dask.config.set(scheduler="threads"):
        # An explicit argument resolves the same way under the override.
        assert get_scheduler(scheduler="threads") is dask.threaded.get
        # With no argument the configured scheduler must be picked up;
        # without this assertion the override itself is never exercised.
        assert get_scheduler() is dask.threaded.get
    # The override must not leak out of the ``with`` block.
    assert get_scheduler() is None
def test_get_scheduler():
    """Check how ``get_scheduler`` resolves names and the config override."""
    # Nothing requested and nothing configured.
    assert get_scheduler() is None
    assert get_scheduler(scheduler='threads') is dask.threaded.get
    assert get_scheduler(scheduler='sync') is dask.local.get_sync
    # ``dask.config.set`` is used instead of the legacy ``dask.set_options``
    # spelling so the override goes through the configuration system.
    with dask.config.set(scheduler='threads'):
        # An explicit argument resolves the same way under the override.
        assert get_scheduler(scheduler='threads') is dask.threaded.get
        # With no argument the configured scheduler must be picked up;
        # without this assertion the override itself is never exercised.
        assert get_scheduler() is dask.threaded.get
    # The override must not leak out of the ``with`` block.
    assert get_scheduler() is None
def test_get_scheduler_with_distributed_active():
    """Each single-machine scheduler request must warn once while the
    configured scheduler is ``"dask.distributed"``."""
    expected_text = (
        "Running on a single-machine scheduler when a distributed client "
        "is active might lead to unexpected results."
    )
    with dask.config.set(scheduler="dask.distributed"):
        with pytest.warns(UserWarning, match=expected_text) as recorded:
            for local_scheduler in ("threads", "sync"):
                get_scheduler(scheduler=local_scheduler)
        # One warning per request made above.
        assert len(recorded) == 2
def _get_scheduler(get=None, collection=None): """ Determine the dask scheduler that is being used. None is returned if not dask scheduler is active. See also -------- dask.base.get_scheduler """ try: # dask 0.18.1 and later from dask.base import get_scheduler actual_get = get_scheduler(get, collection) except ImportError: try: from dask.utils import effective_get actual_get = effective_get(get, collection) except ImportError: return None try: from dask.distributed import Client if isinstance(actual_get.__self__, Client): return 'distributed' except (ImportError, AttributeError): try: import dask.multiprocessing if actual_get == dask.multiprocessing.get: return 'multiprocessing' else: return 'threaded' except ImportError: return 'threaded'
def _get_scheduler(get=None, collection=None): """Determine the dask scheduler that is being used. None is returned if no dask scheduler is active. See also -------- dask.base.get_scheduler """ try: # dask 0.18.1 and later from dask.base import get_scheduler actual_get = get_scheduler(get, collection) except ImportError: try: from dask.utils import effective_get actual_get = effective_get(get, collection) except ImportError: return None try: from dask.distributed import Client if isinstance(actual_get.__self__, Client): return 'distributed' except (ImportError, AttributeError): try: import dask.multiprocessing if actual_get == dask.multiprocessing.get: return 'multiprocessing' else: return 'threaded' except ImportError: return 'threaded'
def assert_divisions(ddf, scheduler=None):
    """Assert that the computed partitions of ``ddf`` respect its divisions.

    Returns immediately when ``ddf`` has no ``divisions`` attribute, or
    (after checking that ``divisions`` is a tuple) when ``known_divisions``
    is missing or falsy. Otherwise the graph is computed with the scheduler
    resolved by ``get_scheduler`` and each non-empty partition's index is
    checked: every partition but the last must lie in
    ``[divisions[i], divisions[i + 1])``, and the last one in
    ``[divisions[-2], divisions[-1]]``.
    """
    if not hasattr(ddf, "divisions"):
        return

    bounds = ddf.divisions
    assert isinstance(bounds, tuple)

    if not getattr(ddf, "known_divisions", False):
        return

    def _leading_index(obj):
        # Index-like objects are used directly; otherwise take the object's
        # index, reduced to its first level when it has levels.
        if is_index_like(obj):
            return obj
        try:
            return obj.index.get_level_values(0)
        except AttributeError:
            return obj.index

    run = get_scheduler(scheduler=scheduler, collections=[type(ddf)])
    parts = run(ddf.dask, ddf.__dask_keys__())

    for pos, part in enumerate(parts[:-1]):
        if not len(part):
            continue
        assert _leading_index(part).min() >= ddf.divisions[pos]
        assert _leading_index(part).max() < ddf.divisions[pos + 1]

    final_part = parts[-1]
    if len(final_part):
        # The upper bound of the last partition is inclusive.
        assert _leading_index(final_part).min() >= ddf.divisions[-2]
        assert _leading_index(final_part).max() <= ddf.divisions[-1]
def test_get_scheduler():
    """Exercise ``get_scheduler`` with callables, names, executors and config."""
    assert get_scheduler() is None

    # Requests that must resolve to one specific scheduler function.
    identity_cases = (
        (dask.local.get_sync, dask.local.get_sync),
        ("threads", dask.threaded.get),
        ("sync", dask.local.get_sync),
    )
    for requested, expected in identity_cases:
        assert get_scheduler(scheduler=requested) is expected

    # Executor objects only need to yield something callable.
    for executor in (dask.local.synchronous_executor, MyExecutor()):
        assert callable(get_scheduler(scheduler=executor))

    with dask.config.set(scheduler="threads"):
        assert get_scheduler() is dask.threaded.get
    # The override ends with the ``with`` block.
    assert get_scheduler() is None
def get_scheduler_lock(get=None, collection=None, scheduler=None):
    """Return a lock instance suited to the scheduler that would be used.

    The scheduler is resolved with ``get_scheduler`` from ``get``,
    ``collection`` and ``scheduler``. If it equals the multiprocessing
    scheduler's ``get``, a ``mp.Manager().Lock()`` is returned; otherwise a
    ``SerializableLock``.
    """
    from . import multiprocessing
    from .base import get_scheduler

    resolved = get_scheduler(
        get=get,
        collections=[collection],
        scheduler=scheduler,
    )
    uses_processes = resolved == multiprocessing.get
    return mp.Manager().Lock() if uses_processes else SerializableLock()
def _get_scheduler(get=None, collection=None) -> Optional[str]: """Determine the dask scheduler that is being used. None is returned if no dask scheduler is active. See also -------- dask.base.get_scheduler """ try: # Fix for bug caused by dask installation that doesn't involve the toolz library # Issue: 4164 import dask from dask.base import get_scheduler # noqa: F401 actual_get = get_scheduler(get, collection) except ImportError: return None try: from dask.distributed import Client if isinstance(actual_get.__self__, Client): return "distributed" except (ImportError, AttributeError): pass try: # As of dask=2.6, dask.multiprocessing requires cloudpickle to be installed # Dependency removed in https://github.com/dask/dask/pull/5511 if actual_get is dask.multiprocessing.get: return "multiprocessing" except AttributeError: pass return "threaded"
def effective_get(get=None, collection=None):
    """Deprecated: see dask.base.get_scheduler

    Emits a warning, then forwards ``get`` and ``collection`` (wrapped in a
    one-element list) to ``dask.base.get_scheduler`` and returns its result.
    """
    deprecation_note = "Deprecated, see dask.base.get_scheduler instead"
    warnings.warn(deprecation_note)

    # Imported after the warning so the warning is emitted first.
    from dask import base as dask_base

    return dask_base.get_scheduler(get=get, collections=[collection])
def to_hdf(
    df,
    path,
    key,
    mode="a",
    append=False,
    scheduler=None,
    name_function=None,
    compute=True,
    lock=None,
    dask_kwargs={},
    **kwargs,
):
    """Store Dask Dataframe to Hierarchical Data Format (HDF) files

    This is a parallel version of the Pandas function of the same name. Please
    see the Pandas docstring for more detailed information about shared keyword
    arguments.

    This function differs from the Pandas version by saving the many partitions
    of a Dask DataFrame in parallel, either to many files, or to many datasets
    within the same file. You may specify this parallelism with an asterisk
    ``*`` within the filename or datapath, and an optional ``name_function``.
    The asterisk will be replaced with an increasing sequence of integers
    starting from ``0`` or with the result of calling ``name_function`` on each
    of those integers.

    This function only supports the Pandas ``'table'`` format, not the more
    specialized ``'fixed'`` format.

    Parameters
    ----------
    path : string, pathlib.Path
        Path to a target filename. Supports strings, ``pathlib.Path``, or any
        object implementing the ``__fspath__`` protocol. May contain a ``*`` to
        denote many filenames.
    key : string
        Datapath within the files. May contain a ``*`` to denote many locations
    mode : {'a', 'w', 'r+'}
        File mode used for the first partition written. Any other value
        raises ``ValueError``. When everything goes to a single file, the
        remaining partitions are written with mode ``'a'``.
    append : bool
        Forwarded to the Pandas writer for the first partition. When
        everything goes to a single datapath, the remaining partitions are
        written with ``append=True``.
    name_function : function
        A function to convert the ``*`` in the above options to a string.
        Should take in a number from 0 to the number of partitions and return a
        string. (see examples below)
    compute : bool
        Whether or not to execute immediately. If False then this returns a
        ``dask.Delayed`` value.
    lock : bool, Lock, optional
        Lock to use to prevent concurrency issues. By default a
        ``threading.Lock``, ``multiprocessing.Lock`` or ``SerializableLock``
        will be used depending on your scheduler if a lock is required. See
        dask.utils.get_scheduler_lock for more information about lock
        selection. A lock object passed here is used as given.
    scheduler : string
        The scheduler to use, like "threads" or "processes"
    dask_kwargs : dict
        Extra keyword arguments forwarded to the compute call when
        ``compute`` is True.
    **other:
        See pandas.to_hdf for more information

    Examples
    --------
    Save Data to a single file

    >>> df.to_hdf('output.hdf', '/data')            # doctest: +SKIP

    Save data to multiple datapaths within the same file:

    >>> df.to_hdf('output.hdf', '/data-*')          # doctest: +SKIP

    Save data to multiple files:

    >>> df.to_hdf('output-*.hdf', '/data')          # doctest: +SKIP

    Save data to multiple files, using the multiprocessing scheduler:

    >>> df.to_hdf('output-*.hdf', '/data', scheduler='processes') # doctest: +SKIP

    Specify custom naming scheme.  This writes files as
    '2000-01-01.hdf', '2000-01-02.hdf', '2000-01-03.hdf', etc..

    >>> from datetime import date, timedelta
    >>> base = date(year=2000, month=1, day=1)
    >>> def name_function(i):
    ...     ''' Convert integer 0 to n to a string '''
    ...     return str(base + timedelta(days=i))

    >>> df.to_hdf('*.hdf', '/data', name_function=name_function) # doctest: +SKIP

    Returns
    -------
    filenames : list
        Returned if ``compute`` is True. List of file names that each partition
        is saved to.
    delayed : dask.Delayed
        Returned if ``compute`` is False. Delayed object to execute ``to_hdf``
        when computed.

    Raises
    ------
    ValueError
        If more than one asterisk is given, if ``format`` is not the table
        format, or if ``mode`` is not one of ``'a'``, ``'w'``, ``'r+'``.

    See Also
    --------
    read_hdf:
    to_parquet:
    """
    name = "to-hdf-" + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, "to_hdf")

    single_file = True
    single_node = True

    path = stringify_path(path)

    # if path is string, format using i_name
    if isinstance(path, str):
        if path.count("*") + key.count("*") > 1:
            raise ValueError(
                "A maximum of one asterisk is accepted in file path and dataset key"
            )

        fmt_obj = lambda path, i_name: path.replace("*", i_name)

        if "*" in path:
            single_file = False
    else:
        if key.count("*") > 1:
            raise ValueError("A maximum of one asterisk is accepted in dataset key")

        # Non-string targets are passed through unchanged for every partition.
        fmt_obj = lambda path, _: path

    if "*" in key:
        single_node = False

    if "format" in kwargs and kwargs["format"] not in ["t", "table"]:
        raise ValueError("Dask only support 'table' format in hdf files.")

    if mode not in ("a", "w", "r+"):
        raise ValueError("Mode must be one of 'a', 'w' or 'r+'")

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when its saved and read
    # so we enforce name_function to maintain the order of its input.
    if not (single_file and single_node):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn(
                "To preserve order between partitions name_function "
                "must preserve the order of its input"
            )

    # If user did not specify scheduler and write is sequential default to the
    # sequential scheduler. otherwise let the _get method choose the scheduler
    if (
        scheduler is None
        and not config.get("scheduler", None)
        and single_node
        and single_file
    ):
        scheduler = "single-threaded"

    # handle lock default based on whether we're writing to a single entity
    _actual_get = get_scheduler(collections=[df], scheduler=scheduler)
    if lock is None:
        if not single_node:
            lock = True
        elif not single_file and _actual_get is not MP_GET:
            # if we're writing to multiple files with the multiprocessing
            # scheduler we don't need to lock
            lock = True
        else:
            lock = False
    # Only a truthy *flag* is swapped for a scheduler-appropriate lock; an
    # object that already looks like a lock is honoured as documented.
    if lock and not hasattr(lock, "acquire"):
        # Keyword argument: the collection must not be bound to a leading
        # ``get`` parameter of ``get_scheduler_lock``.
        lock = get_scheduler_lock(collection=df, scheduler=scheduler)

    kwargs.update({"format": "table", "mode": mode, "append": append})

    dsk = dict()

    # The first partition is written with the caller's mode/append settings.
    i_name = name_function(0)
    dsk[(name, 0)] = (
        _pd_to_hdf,
        pd_to_hdf,
        lock,
        [(df._name, 0), fmt_obj(path, i_name), key.replace("*", i_name)],
        kwargs,
    )

    # Later partitions must not truncate a shared file or overwrite a shared
    # datapath, so they switch to append semantics where targets are shared.
    kwargs2 = kwargs.copy()
    if single_file:
        kwargs2["mode"] = "a"
    if single_node:
        kwargs2["append"] = True

    filenames = []
    for i in range(0, df.npartitions):
        i_name = name_function(i)
        filenames.append(fmt_obj(path, i_name))

    for i in range(1, df.npartitions):
        i_name = name_function(i)
        task = (
            _pd_to_hdf,
            pd_to_hdf,
            lock,
            [(df._name, i), fmt_obj(path, i_name), key.replace("*", i_name)],
            kwargs2,
        )
        if single_file:
            # Chain on the previous write when appending to one datapath,
            # otherwise only on the first write to the shared file.
            link_dep = i - 1 if single_node else 0
            task = (_link, (name, link_dep), task)
        dsk[(name, i)] = task

    dsk = merge(df.dask, dsk)
    if single_file and single_node:
        # Writes are chained, so requesting the last key runs all of them.
        keys = [(name, df.npartitions - 1)]
    else:
        keys = [(name, i) for i in range(df.npartitions)]

    if compute:
        compute_as_if_collection(
            DataFrame, dsk, keys, scheduler=scheduler, **dask_kwargs
        )
        return filenames
    else:
        return delayed([Delayed(k, dsk) for k in keys])