예제 #1
0
def from_modin(df):
    """Convert a Modin (Ray-backed) DataFrame into a BlockArray.

    Every Modin partition is mapped to a NumPy array, wrapped as a BlockArray
    through its object id, and the per-partition arrays are concatenated
    first along axis 1 (within a partition row) and then along axis 0.

    Args:
        df: A ``modin.pandas`` DataFrame whose ``_query_compiler._modin_frame``
            is a ``PandasOnRayFrame``. All columns must share a single dtype
            accepted by ``array_utils.is_supported``.

    Returns:
        The BlockArray produced by concatenating the per-partition arrays.

    Raises:
        Exception: If the modin modules cannot be imported.
        AssertionError: If ``df`` or its underlying frame has an unexpected
            type.
        TypeError: If the dtype is unsupported, or the columns do not all
            share the same dtype.
    """
    # pylint: disable = import-outside-toplevel, protected-access, unidiomatic-typecheck
    import numpy as np

    try:
        from modin.pandas.dataframe import DataFrame
        from modin.engines.ray.pandas_on_ray.frame.data import PandasOnRayFrame
        from modin.engines.ray.pandas_on_ray.frame.partition import (
            PandasOnRayFramePartition, )
    except Exception as e:
        raise Exception(
            "Unable to import modin. Install modin with command 'pip install modin'"
        ) from e

    assert isinstance(
        df, DataFrame), "Unexpected dataframe type %s" % str(type(df))
    assert isinstance(df._query_compiler._modin_frame,
                      PandasOnRayFrame), "Unexpected dataframe type %s" % str(
                          type(df._query_compiler._modin_frame))
    frame: PandasOnRayFrame = df._query_compiler._modin_frame

    app: ArrayApplication = _instance()
    system = app.cm

    # Make sure the partitions are numeric and share a single dtype.
    dtype = frame.dtypes[0]
    if not array_utils.is_supported(dtype, type_test=True):
        raise TypeError("%s is not supported." % str(dtype))
    for dt in frame.dtypes:
        if dt != dtype:
            # The placeholders were previously left uninterpolated.
            raise TypeError("Mixed types are not supported (%s != %s)." %
                            (str(dt), str(dtype)))

    # Resolve the dtype's name to the attribute of the same name on numpy.
    dtype = getattr(np, str(dtype))

    # Convert from Pandas to NumPy.
    pd_parts = frame._partition_mgr_cls.map_partitions(
        frame._partitions, lambda part_df: np.array(part_df))
    grid_shape = len(frame._row_lengths), len(frame._column_widths)

    shape = (np.sum(frame._row_lengths), np.sum(frame._column_widths))
    block_shape = app.get_block_shape(shape, dtype)
    rows = []
    for i in range(grid_shape[0]):
        cols = []
        for j in range(grid_shape[1]):
            curr_block_shape = (frame._row_lengths[i], frame._column_widths[j])
            part: PandasOnRayFramePartition = pd_parts[(i, j)]
            # Drain the partition's call queue before its oid is read.
            part.drain_call_queue()
            ba: BlockArray = BlockArray.from_oid(part.oid, curr_block_shape,
                                                 dtype, system)
            cols.append(ba)
        if grid_shape[1] == 1:
            # A single column partition needs no concatenation.
            row_ba: BlockArray = cols[0]
        else:
            row_ba: BlockArray = app.concatenate(
                cols, axis=1, axis_block_size=block_shape[1])
        rows.append(row_ba)
    result = app.concatenate(rows, axis=0, axis_block_size=block_shape[0])
    return result
예제 #2
0
 def predict(self, X: BlockArray):
     """Invoke the remote actor's ``predict`` method on ``X``.

     ``X`` is first passed through ``_check_array``. Only the first entry of
     ``X.flattened_oids()`` is sent to the actor (presumably ``X`` is held
     in a single block -- TODO confirm against ``_check_array``).

     Returns:
         A BlockArray of shape ``(X.shape[0], )`` wrapping the actor's result
         oid, with dtype ``predict_dtype`` (a name taken from the enclosing
         scope, not visible in this block).
     """
     _check_array(X, True)
     prediction_oid = instance().cm.call_actor_method(
         self.actor,
         "predict",
         X.flattened_oids()[0],
     )
     output_shape = (X.shape[0], )
     return BlockArray.from_oid(prediction_oid,
                                shape=output_shape,
                                dtype=predict_dtype,
                                cm=instance().cm)
예제 #3
0
def train_test_split(*arrays,
                     test_size: Union[int, float] = None,
                     train_size: Union[int, float] = None,
                     random_state: Optional[Union[NumsRandomState,
                                                  int]] = None,
                     shuffle: bool = True,
                     stratify=None):
    """Split each input array into two parts through one remote call.

    Every array is passed through ``_check_array``; the first flattened oid
    of each is handed to the remote ``"train_test_split"`` call, which is
    asked for ``2 * len(arrays)`` return values.

    Args:
        *arrays: Arrays to split.
        test_size: Forwarded unchanged to the remote call.
        train_size: Forwarded unchanged to the remote call.
        random_state: ``None``, an ``int`` (converted with
            ``instance().random_state``), or an existing random state object.
        shuffle: Forwarded unchanged to the remote call.
        stratify: Forwarded unchanged to the remote call.

    Returns:
        A list of BlockArrays, one per oid returned by the remote call, in
        the order the remote call returned them.
    """
    # pylint: disable = protected-access
    updated_arrays = [_check_array(array) for array in arrays]
    syskwargs = {
        "options": {
            "num_returns": 2 * len(updated_arrays)
        },
        "grid_entry": (0, ),
        "grid_shape": (1, ),
    }

    rng_params = None
    if random_state is not None:
        if isinstance(random_state, int):
            # An int is a seed: build a random state from it.
            random_state = instance().random_state(random_state)
        rng_params = random_state._rng.new_block_rng_params()

    array_oids = []
    for checked in updated_arrays:
        array_oids.append(checked.flattened_oids()[0])

    result_oids = instance().cm.call("train_test_split",
                                     *array_oids,
                                     rng_params=rng_params,
                                     test_size=test_size,
                                     train_size=train_size,
                                     shuffle=shuffle,
                                     stratify=stratify,
                                     syskwargs=syskwargs)

    # Optimize by computing this directly.
    shape_dtype_oids = []
    for r_oid in result_oids:
        meta_syskwargs = {"grid_entry": (0, ), "grid_shape": (1, )}
        shape_dtype_oids.append(instance().cm.shape_dtype(
            r_oid, syskwargs=meta_syskwargs))
    shape_dtypes = instance().cm.get(shape_dtype_oids)

    return [
        BlockArray.from_oid(r_oid,
                            shape=shape,
                            dtype=dtype,
                            cm=instance().cm)
        for r_oid, (shape, dtype) in zip(result_oids, shape_dtypes)
    ]
예제 #4
0
 def where(self, condition: BlockArray, x=None, y=None):
     """Evaluate ``where`` block by block and gather the per-axis results.

     ``self.system.where`` is called once per block of ``condition`` and is
     asked for ``num_axes + 1`` return values: one oid per axis followed by
     an oid for the block's result shape. The shapes are fetched with
     ``self.system.get``; blocks whose shape equals ``(0, )`` are dropped and
     the remaining per-axis blocks are concatenated along axis 0.

     Args:
         condition: The BlockArray to evaluate.
         x: Optional; must be given together with ``y``. Passed through to
             ``self.system.where`` unchanged.
         y: Optional; must be given together with ``x``. Passed through to
             ``self.system.where`` unchanged.

     Returns:
         A tuple with one int64 BlockArray per axis of ``condition`` (at
         least one). When no block produced any result, every entry is an
         empty array with ``block_shape=(0, )``.

     Raises:
         AssertionError: If only one of ``x``/``y`` is given, or if their
             shape or block shape differs from ``condition``'s.
     """
     num_axes = max(1, len(condition.shape))
     # Stronger constraint than necessary, but no reason for anything stronger.
     if x is not None or y is not None:
         assert x is not None and y is not None
         assert condition.shape == x.shape == y.shape
         assert condition.block_shape == x.block_shape == y.block_shape

     result_oids = []
     shape_oids = []
     for grid_entry in condition.grid.get_entry_iterator():
         block: Block = condition.blocks[grid_entry]
         block_slice_tuples = condition.grid.get_slice_tuples(grid_entry)
         roids = self.system.where(block.oid,
                                   x,
                                   y,
                                   block_slice_tuples,
                                   syskwargs={
                                       "grid_entry": grid_entry,
                                       "grid_shape":
                                       condition.grid.grid_shape,
                                       "options": {
                                           "num_returns": num_axes + 1
                                       }
                                   })
         # The last return value is the shape oid; the rest are per-axis.
         result_oids.append(roids[:-1])
         shape_oids.append(roids[-1])

     shapes = self.system.get(shape_oids)
     result_shape = (np.sum(shapes), )
     if result_shape == (0, ):
         # Return one empty array per axis so the tuple length matches the
         # non-empty path (previously a 1-tuple regardless of num_axes).
         return tuple(
             self.array(np.array([], dtype=np.int64), block_shape=(0, ))
             for _ in range(num_axes))

     result_block_shape = self.compute_block_shape(result_shape, np.int64)
     result_arrays = []
     for axis in range(num_axes):
         # Skip blocks that contributed nothing.
         block_arrays = [
             BlockArray.from_oid(block_oids[axis], shape, np.int64,
                                 self.system)
             for block_oids, shape in zip(result_oids, shapes)
             if shape != (0, )
         ]
         if len(block_arrays) == 1:
             axis_result = block_arrays[0]
         else:
             axis_result = self.concatenate(block_arrays, 0,
                                            result_block_shape[0])
         result_arrays.append(axis_result)
     return tuple(result_arrays)
예제 #5
0
 def fit_transform(self, X: BlockArray, y: BlockArray = None):
     """Invoke the remote actor's ``fit_transform`` method on ``X`` (and ``y``).

     Both arrays are passed through ``_check_array``; only the first entry of
     each array's ``flattened_oids()`` is sent to the actor. When ``y`` is
     ``None`` the actor receives ``None`` in its place.

     Returns:
         A BlockArray with the same shape as ``X`` and dtype ``float``
         wrapping the actor's result oid.
     """
     _check_array(X, True)
     y_oid = None
     if y is not None:
         _check_array(y, True)
         y_oid = y.flattened_oids()[0]
     transformed_oid = instance().cm.call_actor_method(
         self.actor,
         "fit_transform",
         X.flattened_oids()[0],
         y_oid,
     )
     return BlockArray.from_oid(transformed_oid,
                                shape=X.shape,
                                dtype=float,
                                cm=instance().cm)
예제 #6
0
    def top_k(self,
              arr: BlockArray,
              k: int,
              largest=True) -> Tuple[BlockArray, BlockArray]:
        """Select `k` elements of a 1D BlockArray from its largest or smallest end.

        A threshold element is obtained with ``self.quickselect`` (index
        ``k - 1`` when `largest`, ``-k`` otherwise). The returned indices are
        those of all elements strictly beyond the threshold, followed by as
        many indices of elements equal to the threshold as are needed to
        reach `k`. No guarantee is made about which of several equal
        threshold elements are included.

        Args:
            arr: A 1D BlockArray.
            k: Number of elements to return; between 1 and ``arr.size``.
            largest: Select from the largest end if true, else the smallest.

        Returns:
            A tuple ``(values, indices)`` of BlockArrays where ``values`` is
            ``arr[indices]``.

        Raises:
            NotImplementedError: If `arr` is not 1-dimensional.
            IndexError: If `k` is not in ``[1, arr.size]``.
        """
        if arr.ndim != 1:
            raise NotImplementedError("Only 1D 'arr' is currently supported.")
        if not 0 < k <= arr.size:
            raise IndexError(
                "'k' must be at least 1 and at most the size of 'arr'.")

        arr_oids = arr.flattened_oids()
        select_index = k - 1 if largest else -k
        kth_oid = self.quickselect(arr_oids, select_index)
        kth_value = BlockArray.from_oid(kth_oid, (1, ), arr.dtype, self.cm)

        # Elements strictly beyond the threshold are always part of the result.
        if largest:
            strict_mask = arr > kth_value[0]
        else:
            strict_mask = arr < kth_value[0]
        strict_indices = self.where(strict_mask)[0]

        # Fill the remaining slots with elements equal to the threshold.
        tie_indices = self.where(arr == kth_value[0])[0]
        slots_left = k - strict_indices.size
        tie_indices_kept = tie_indices[:slots_left]

        axis_block_size = self.compute_block_shape((k, ), int)[0]
        indices = self.concatenate([strict_indices, tie_indices_kept], 0,
                                   axis_block_size)
        return arr[indices], indices
예제 #7
0
    def median(self, arr: BlockArray) -> BlockArray:
        """Compute the median of a 1D BlockArray using ``self.quickselect``.

        For an odd number of elements the element selected at index
        ``arr.size // 2`` is returned. For an even number, the elements
        selected at indices ``arr.size // 2 - 1`` and ``arr.size // 2`` are
        averaged.

        Args:
            arr: A 1D BlockArray.

        Returns:
            A BlockArray holding the median value.

        Raises:
            NotImplementedError: If `arr` is not 1-dimensional.
        """
        if arr.ndim != 1:
            raise NotImplementedError("Only 1D 'arr' is currently supported.")

        block_oids = arr.flattened_oids()
        half = arr.size // 2

        def select(rank):
            # Wrap the oid selected at `rank` as a single-element BlockArray.
            selected_oid = self.quickselect(block_oids, rank)
            return BlockArray.from_oid(selected_oid, (1, ), arr.dtype,
                                       self.cm)

        if arr.size % 2 != 0:
            return select(half)
        lower = select(half - 1)
        upper = select(half)
        return (lower + upper) / 2
예제 #8
0
    def quantile(
        self, arr: BlockArray, q: float, interpolation="linear", method="tdigest"
    ) -> BlockArray:
        """Compute the q-th quantile of the array elements.

        One t-digest is built per block of `arr` with ``self.cm.tdigest_chunk``
        and the digests are combined by ``self.cm.percentiles_from_tdigest``.

        Args:
            arr: A 1D BlockArray.
            q: Quantile to compute, which must be between 0 and 1 inclusive.
            interpolation: Interpolation method to use when the desired
                quantile lies between two data points i < j. Only "linear"
                is accepted.
                also see https://numpy.org/doc/1.20/reference/generated/numpy.quantile.html.
                also see https://docs.dask.org/en/latest/_modules/dask/array/percentile.html.
            method: Estimation method. Only "tdigest" is accepted.

        Returns:
            A BlockArray of shape ``(1,)`` and dtype ``np.float64`` holding
            the q-th quantile of the array elements.

        Raises:
            Exception: If the ``crick`` package cannot be imported.
            NotImplementedError: If `arr` is not 1-dimensional.
            ValueError: If `q` is outside ``[0, 1]``.
            AssertionError: If `interpolation` or `method` is unsupported.
        """
        # pylint: disable = import-outside-toplevel, unused-import
        try:
            # Imported only to check that the dependency is installed.
            import crick
        except Exception as e:
            # Adjacent literals keep source indentation out of the message
            # (the previous backslash continuation embedded it).
            raise Exception(
                "Unable to import crick. "
                "Install crick with command 'pip install cython; pip install crick'"
            ) from e

        if arr.ndim != 1:
            raise NotImplementedError("Only 1D 'arr' is currently supported.")
        if q < 0.0 or q > 1.0:
            raise ValueError("Quantiles must be in the range [0, 1]")
        assert interpolation == "linear"
        assert method == "tdigest"

        arr_oids = arr.flattened_oids()
        num_arrs = len(arr_oids)

        t_oids = [
            self.cm.tdigest_chunk(
                arr_oid,
                syskwargs={
                    "grid_entry": (i,),
                    "grid_shape": (num_arrs,),
                    "options": {"num_returns": 1},
                },
            )
            for i, arr_oid in enumerate(arr_oids)
        ]

        # The reduction uses the same syskwargs as the last chunk. This was
        # previously implicit, through the loop variable leaking out of the
        # loop. NOTE(review): assumes `arr` has at least one block.
        reduce_syskwargs = {
            "grid_entry": (num_arrs - 1,),
            "grid_shape": (num_arrs,),
            "options": {"num_returns": 1},
        }
        p_oid = self.cm.percentiles_from_tdigest(
            [q], *t_oids, syskwargs=reduce_syskwargs
        )
        return BlockArray.from_oid(p_oid, (1,), np.float64, self.cm)
예제 #9
0
 def array_compare(self, func_name, a: BlockArray, b: BlockArray, *args):
     """Compare two identically partitioned BlockArrays block by block.

     ``self.cm.array_compare`` is called for every pair of corresponding
     blocks, and the per-block results are combined with
     ``self.cm.logical_and``.

     Args:
         func_name: Name of the comparison, forwarded to
             ``self.cm.array_compare``.
         a: First BlockArray.
         b: Second BlockArray; must match ``a`` in shape and block shape.
         *args: Extra arguments, forwarded as a single tuple.

     Returns:
         A 0-dimensional boolean BlockArray holding the combined result.

     Raises:
         AssertionError: If the shapes or block shapes differ.
     """
     assert a.shape == b.shape and a.block_shape == b.block_shape
     grid_shape = a.grid.grid_shape
     bool_list = [
         self.cm.array_compare(
             func_name,
             a.blocks[grid_entry].oid,
             b.blocks[grid_entry].oid,
             args,
             syskwargs={"grid_entry": grid_entry, "grid_shape": grid_shape},
         )
         for grid_entry in a.grid.get_entry_iterator()
     ]
     oid = self.cm.logical_and(
         *bool_list, syskwargs={"grid_entry": (0, 0), "grid_shape": (1, 1)}
     )
     # `np.bool` was only an alias of the builtin and was removed in
     # NumPy 1.24, so the builtin is used directly.
     return BlockArray.from_oid(oid, (), bool, self.cm)
예제 #10
0
 def score(self,
           X: BlockArray,
           y: BlockArray,
           sample_weight: BlockArray = None):
     """Invoke the remote actor's ``score`` method.

     ``X``, ``y`` and (when given) ``sample_weight`` are passed through
     ``_check_array``; only the first entry of each array's
     ``flattened_oids()`` is sent to the actor. When ``sample_weight`` is
     ``None`` the actor receives ``None`` in its place.

     Returns:
         A 0-dimensional BlockArray of dtype ``float`` wrapping the actor's
         result oid.
     """
     _check_array(X, True)
     _check_array(y, True)
     weight_oid = None
     if sample_weight is not None:
         _check_array(sample_weight, True)
         weight_oid = sample_weight.flattened_oids()[0]
     score_oid = instance().cm.call_actor_method(
         self.actor,
         "score",
         X.flattened_oids()[0],
         y.flattened_oids()[0],
         weight_oid,
     )
     return BlockArray.from_oid(score_oid,
                                shape=(),
                                dtype=float,
                                cm=instance().cm)
예제 #11
0
 def allclose(self, a: BlockArray, b: BlockArray, rtol=1.e-5, atol=1.e-8):
     """Check block by block whether two BlockArrays are close.

     ``self._system.allclose`` is called for every pair of corresponding
     blocks, and the per-block results are combined with
     ``self._system.logical_and``.

     Args:
         a: First BlockArray.
         b: Second BlockArray; must match ``a`` in shape and block shape.
         rtol: Relative tolerance, forwarded to ``self._system.allclose``.
         atol: Absolute tolerance, forwarded to ``self._system.allclose``.

     Returns:
         A 0-dimensional boolean BlockArray holding the combined result.

     Raises:
         AssertionError: If the shapes or block shapes differ.
     """
     assert a.shape == b.shape and a.block_shape == b.block_shape
     grid_shape = a.grid.grid_shape
     bool_list = [
         self._system.allclose(a.blocks[grid_entry].oid,
                               b.blocks[grid_entry].oid,
                               rtol,
                               atol,
                               syskwargs={
                                   "grid_entry": grid_entry,
                                   "grid_shape": grid_shape
                               })
         for grid_entry in a.grid.get_entry_iterator()
     ]
     oid = self._system.logical_and(*bool_list,
                                    syskwargs={
                                        "grid_entry": (0, 0),
                                        "grid_shape": (1, 1)
                                    })
     # `np.bool` was only an alias of the builtin and was removed in
     # NumPy 1.24, so the builtin is used directly.
     return BlockArray.from_oid(oid, (), bool, self._system)