def mask(self, row_labels, col_labels):
    """
    Lazily create a mask that extracts the indices provided.

    Parameters
    ----------
    row_labels : list-like, slice or label
        The row labels for the rows to extract.
    col_labels : list-like, slice or label
        The column labels for the columns to extract.

    Returns
    -------
    PandasOnDaskDataframePartition
        A new ``PandasOnDaskDataframePartition`` object.
    """
    new_obj = super().mask(row_labels, col_labels)
    if isinstance(row_labels, slice) and isinstance(self._length_cache, Future):
        new_obj._length_cache = DaskWrapper.deploy(
            compute_sliced_len, row_labels, self._length_cache
        )
    if isinstance(col_labels, slice) and isinstance(self._width_cache, Future):
        new_obj._width_cache = DaskWrapper.deploy(
            compute_sliced_len, col_labels, self._width_cache
        )
    return new_obj
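# Usage sketch (hypothetical, not part of the module). Assumes `part` is a
# populated PandasOnDaskDataframePartition whose `_length_cache` is still a
# Future:
#
#     masked = part.mask(slice(0, 10), slice(None))
#     # Nothing is materialized yet: the new length cache is itself a Future,
#     # produced by deploying `compute_sliced_len` on the parent's cached length.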
def drain_call_queue(self):
    """Execute all operations stored in the call queue on the object wrapped by this partition."""
    if len(self.call_queue) == 0:
        return
    call_queue = self.call_queue
    if len(call_queue) > 1:
        futures = DaskWrapper.deploy(
            apply_list_of_funcs, call_queue, self.future, num_returns=2, pure=False
        )
    else:
        # We handle `len(call_queue) == 1` in a different way because
        # this improves performance a bit.
        func, args, kwargs = call_queue[0]
        futures = DaskWrapper.deploy(
            apply_func,
            self.future,
            func,
            *args,
            num_returns=2,
            pure=False,
            **kwargs,
        )
    self.future = futures[0]
    self._ip_cache = futures[1]
    self.call_queue = []
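# Usage sketch (hypothetical): queued operations stay lazy until the queue is
# drained. Assumes `part` is a populated PandasOnDaskDataframePartition:
#
#     part.call_queue.append([lambda df: df.dropna(), (), {}])
#     part.drain_call_queue()
#     # The queued call has now run remotely in a single task; `part.future`
#     # points at the new result and `part._ip_cache` at the worker's IP.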
def apply(self, func, *args, **kwargs):
    """
    Apply a function to the object wrapped by this partition.

    Parameters
    ----------
    func : callable
        A function to apply.
    *args : iterable
        Additional positional arguments to be passed in `func`.
    **kwargs : dict
        Additional keyword arguments to be passed in `func`.

    Returns
    -------
    PandasOnDaskDataframePartition
        A new ``PandasOnDaskDataframePartition`` object.

    Notes
    -----
    The keyword arguments are sent as a dictionary.
    """
    call_queue = self.call_queue + [[func, args, kwargs]]
    if len(call_queue) > 1:
        futures = DaskWrapper.deploy(
            apply_list_of_funcs, call_queue, self.future, num_returns=2, pure=False
        )
    else:
        # We handle `len(call_queue) == 1` in a different way because
        # this improves performance a bit.
        func, args, kwargs = call_queue[0]
        futures = DaskWrapper.deploy(
            apply_func,
            self.future,
            func,
            *args,
            num_returns=2,
            pure=False,
            **kwargs,
        )
    return PandasOnDaskDataframePartition(futures[0], ip=futures[1])
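# Usage sketch (hypothetical): `apply` is non-mutating. It folds the existing
# call queue plus the new function into one remote task and returns a fresh
# partition, leaving `part` itself untouched:
#
#     scaled = part.apply(lambda df, factor: df * factor, factor=2)
#     # num_returns=2 above: the task yields both the result future and the
#     # IP of the worker that holds it, hence `ip=futures[1]`.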
def get(self):
    """
    Get the object wrapped by this partition out of the distributed memory.

    Returns
    -------
    pandas.DataFrame
        The object from the distributed memory.
    """
    self.drain_call_queue()
    return DaskWrapper.materialize(self.future)
def ip(self):
    """
    Get the node IP address of the object wrapped by this partition.

    Returns
    -------
    str
        IP address of the node that holds the data.
    """
    if self._ip_cache is None:
        self._ip_cache = self.apply(lambda df: df)._ip_cache
    if isinstance(self._ip_cache, Future):
        self._ip_cache = DaskWrapper.materialize(self._ip_cache)
    return self._ip_cache
def width(self):
    """
    Get the width of the object wrapped by the partition.

    Returns
    -------
    int
        The width of the object.
    """
    if self._width_cache is None:
        self._width_cache = self.apply(lambda df: len(df.columns)).future
    if isinstance(self._width_cache, Future):
        self._width_cache = DaskWrapper.materialize(self._width_cache)
    return self._width_cache
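# Usage sketch (hypothetical): the `width`/`ip` getters materialize their
# caches at most once; later calls return the plain cached value:
#
#     w = part.width()   # may deploy `len(df.columns)` on the cluster
#     w = part.width()   # served from `_width_cache`, no cluster round-trip
#     df = part.get()    # drains the call queue, then materializes the data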
@classmethod
def get_objects_from_partitions(cls, partitions):
    """
    Get the objects wrapped by `partitions` in parallel.

    Parameters
    ----------
    partitions : np.ndarray
        NumPy array with ``PandasDataframePartition``-s.

    Returns
    -------
    list
        The objects wrapped by `partitions`.
    """
    return DaskWrapper.materialize(
        [partition.future for partition in partitions]
    )
@classmethod
def preprocess_func(cls, func):
    """
    Preprocess a function before an ``apply`` call.

    Parameters
    ----------
    func : callable
        The function to preprocess.

    Returns
    -------
    callable
        An object that can be accepted by ``apply``.
    """
    return DaskWrapper.put(func, hash=False, broadcast=True)
@classmethod
def put(cls, obj):
    """
    Put an object into distributed memory and wrap it with a partition object.

    Parameters
    ----------
    obj : any
        An object to be put.

    Returns
    -------
    PandasOnDaskDataframePartition
        A new ``PandasOnDaskDataframePartition`` object.
    """
    return cls(DaskWrapper.put(obj, hash=False))
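# Usage sketch (hypothetical): a put/get round-trip through the cluster.
# `preprocess_func` scatters a callable with broadcast=True so that repeated
# `apply` calls don't re-serialize it for every worker:
#
#     part = PandasOnDaskDataframePartition.put(pandas.DataFrame({"a": [1, 2]}))
#     add_one = PandasOnDaskDataframePartition.preprocess_func(lambda df: df + 1)
#     assert part.apply(add_one).get()["a"].tolist() == [2, 3]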
@classmethod
def deploy_func_between_two_axis_partitions(
    cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
):
    """
    Deploy a function along a full axis between two data sets.

    Parameters
    ----------
    axis : {0, 1}
        The axis to perform the function along.
    func : callable
        The function to perform.
    num_splits : int
        The number of splits to return (see `split_result_of_axis_func_pandas`).
    len_of_left : int
        The number of values in `partitions` that belong to the left data set.
    other_shape : np.ndarray
        The shape of the right frame in terms of partitions, i.e.
        ``(other_shape[i-1], other_shape[i])`` indicates the slice that
        restores the ``i-1``-th axis partition.
    kwargs : dict
        Additional keyword arguments to be passed in `func`.
    *partitions : iterable
        All partitions that make up the full axis (row or column) for both
        data sets.

    Returns
    -------
    list
        A list of ``distributed.Future``.
    """
    return DaskWrapper.deploy(
        deploy_dask_func,
        PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions,
        axis,
        func,
        num_splits,
        len_of_left,
        other_shape,
        kwargs,
        *partitions,
        num_returns=num_splits * 4,
        pure=False,
    )
@classmethod
def deploy_axis_func(
    cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    """
    Deploy a function along a full axis.

    Parameters
    ----------
    axis : {0, 1}
        The axis to perform the function along.
    func : callable
        The function to perform.
    num_splits : int
        The number of splits to return (see `split_result_of_axis_func_pandas`).
    kwargs : dict
        Additional keyword arguments to be passed in `func`.
    maintain_partitioning : bool
        If True, keep the old partitioning if possible.
        If False, create a new partition layout.
    *partitions : iterable
        All partitions that make up the full axis (row or column).

    Returns
    -------
    list
        A list of ``distributed.Future``.
    """
    # The reserved `_lengths` kwarg, when provided, fixes the lengths of the
    # output splits, so the number of returned splits follows it.
    lengths = kwargs.get("_lengths", None)
    result_num_splits = len(lengths) if lengths else num_splits
    return DaskWrapper.deploy(
        deploy_dask_func,
        PandasDataframeAxisPartition.deploy_axis_func,
        axis,
        func,
        num_splits,
        kwargs,
        maintain_partitioning,
        *partitions,
        num_returns=result_num_splits * 4,
        pure=False,
    )
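# Note (assumption based on the surrounding code): `num_returns` is four times
# the number of splits because `deploy_dask_func` reports each result split as
# four futures (the data itself plus its length, width, and worker IP), which
# both deploy functions above rely on when unpacking results.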
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

PartitionClass = (
    FactoryDispatcher.get_factory().io_cls.frame_cls._partition_mgr_cls._partition_class
)

if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed import Future

    from modin.core.execution.dask.common.engine_wrapper import DaskWrapper

    put_func = lambda x: DaskWrapper.put(x)  # noqa: E731
    get_func = lambda x: DaskWrapper.materialize(x)  # noqa: E731
    FutureType = Future
elif Engine.get() == "Python":
    put_func = lambda x: x  # noqa: E731
    get_func = lambda x: x  # noqa: E731
    FutureType = object
else:
    raise NotImplementedError(
        f"'{Engine.get()}' engine is not supported by these test suites"
    )

NPartitions.put(4)

# HACK: implicit engine initialization (Modin issue #2989)
pd.DataFrame([])
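# Usage sketch (hypothetical): with the engine-agnostic aliases above, tests
# can exercise any backend uniformly:
#
#     data = pandas.DataFrame({"a": [1, 2, 3]})
#     future = put_func(data)
#     assert isinstance(future, FutureType)
#     assert get_func(future).equals(data)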