def from_modin(df):
    """Convert a Modin DataFrame backed by a ``PandasOnRayFrame`` to a BlockArray.

    Every partition of the underlying frame is mapped to a NumPy array through
    the frame's partition manager, wrapped with ``BlockArray.from_oid``, and
    the pieces are concatenated first along columns (axis 1) and then along
    rows (axis 0).

    Args:
        df: A ``modin.pandas`` DataFrame whose ``_query_compiler._modin_frame``
            is a ``PandasOnRayFrame``. All columns must share a single dtype
            accepted by ``array_utils.is_supported``.

    Returns:
        The BlockArray obtained by concatenating the per-partition arrays.

    Raises:
        Exception: If the modin modules cannot be imported.
        AssertionError: If ``df`` or its underlying frame has an unexpected type.
        TypeError: If the dtype is unsupported or the columns have mixed dtypes.
    """
    # pylint: disable = import-outside-toplevel, protected-access, unidiomatic-typecheck
    import numpy as np
    try:
        from modin.pandas.dataframe import DataFrame
        from modin.engines.ray.pandas_on_ray.frame.data import PandasOnRayFrame
        from modin.engines.ray.pandas_on_ray.frame.partition import (
            PandasOnRayFramePartition,
        )
    except Exception as e:
        raise Exception(
            "Unable to import modin. Install modin with command 'pip install modin'"
        ) from e

    assert isinstance(
        df, DataFrame), "Unexpected dataframe type %s" % str(type(df))
    assert isinstance(df._query_compiler._modin_frame,
                      PandasOnRayFrame), "Unexpected dataframe type %s" % str(
                          type(df._query_compiler._modin_frame))
    frame: PandasOnRayFrame = df._query_compiler._modin_frame

    app: ArrayApplication = _instance()
    system = app.cm

    # Make sure the partitions are numeric and share a single dtype.
    dtype = frame.dtypes[0]
    if not array_utils.is_supported(dtype, type_test=True):
        raise TypeError("%s is not supported." % str(dtype))
    for dt in frame.dtypes:
        if dt != dtype:
            # The placeholders were previously left unformatted.
            raise TypeError(
                "Mixed types are not supported (%s != %s)." % (dt, dtype))
    # Map the pandas dtype name (e.g. "float64") to the NumPy scalar type.
    dtype = getattr(np, str(dtype))

    # Convert from Pandas to NumPy.
    pd_parts = frame._partition_mgr_cls.map_partitions(
        frame._partitions, lambda part_df: np.array(part_df))
    grid_shape = len(frame._row_lengths), len(frame._column_widths)

    shape = (np.sum(frame._row_lengths), np.sum(frame._column_widths))
    block_shape = app.get_block_shape(shape, dtype)
    rows = []
    for i in range(grid_shape[0]):
        cols = []
        for j in range(grid_shape[1]):
            curr_block_shape = (frame._row_lengths[i],
                                frame._column_widths[j])
            part: PandasOnRayFramePartition = pd_parts[(i, j)]
            # Run any pending calls on the partition before reading its oid.
            part.drain_call_queue()
            ba: BlockArray = BlockArray.from_oid(part.oid, curr_block_shape,
                                                 dtype, system)
            cols.append(ba)
        if grid_shape[1] == 1:
            # A single column partition needs no column-wise concatenation.
            row_ba: BlockArray = cols[0]
        else:
            row_ba: BlockArray = app.concatenate(
                cols, axis=1, axis_block_size=block_shape[1])
        rows.append(row_ba)
    result = app.concatenate(rows, axis=0, axis_block_size=block_shape[0])
    return result
def predict(self, X: BlockArray):
    """Invoke the remote actor's ``predict`` method on ``X``.

    Only the first flattened block of ``X`` is sent to the actor. The returned
    object id is wrapped as a 1-D BlockArray with ``X.shape[0]`` entries.

    Args:
        X: Input BlockArray; validated with ``_check_array(X, True)``.

    Returns:
        A BlockArray of shape ``(X.shape[0],)`` with dtype ``predict_dtype``.
    """
    # NOTE(review): `predict_dtype` is not defined in this block; presumably it
    # is supplied by an enclosing scope - confirm.
    _check_array(X, True)
    x_oid = X.flattened_oids()[0]
    result_oid = instance().cm.call_actor_method(self.actor, "predict", x_oid)
    out_shape = (X.shape[0], )
    return BlockArray.from_oid(
        result_oid,
        shape=out_shape,
        dtype=predict_dtype,
        cm=instance().cm,
    )
def train_test_split(*arrays,
                     test_size: Union[int, float] = None,
                     train_size: Union[int, float] = None,
                     random_state: Optional[Union[NumsRandomState, int]] = None,
                     shuffle: bool = True,
                     stratify=None):
    """Split the given arrays into train and test parts via a remote call.

    Each input is passed through ``_check_array`` and its first flattened block
    is handed to the remote ``"train_test_split"`` call, which is asked for
    ``2 * len(arrays)`` return values. The shape and dtype of every returned
    object are then fetched so each can be wrapped as a BlockArray.

    Args:
        *arrays: The arrays to split.
        test_size: Forwarded to the remote call.
        train_size: Forwarded to the remote call.
        random_state: ``None``, an integer seed, or a ``NumsRandomState``.
            An integer is first turned into a random state through
            ``instance().random_state``.
        shuffle: Forwarded to the remote call.
        stratify: Forwarded to the remote call.

    Returns:
        A list of BlockArrays, one per object returned by the remote call.
    """
    # pylint: disable = protected-access
    updated_arrays = [_check_array(array) for array in arrays]
    syskwargs = {
        "options": {
            "num_returns": 2 * len(updated_arrays)
        },
        "grid_entry": (0, ),
        "grid_shape": (1, ),
    }

    rng_params = None
    if random_state is not None:
        if isinstance(random_state, int):
            # It's a seed.
            random_state = instance().random_state(random_state)
        rng_params = random_state._rng.new_block_rng_params()

    array_oids = []
    for checked in updated_arrays:
        array_oids.append(checked.flattened_oids()[0])

    result_oids = instance().cm.call("train_test_split",
                                     *array_oids,
                                     rng_params=rng_params,
                                     test_size=test_size,
                                     train_size=train_size,
                                     shuffle=shuffle,
                                     stratify=stratify,
                                     syskwargs=syskwargs)

    # Optimize by computing this directly.
    shape_dtype_oids = []
    for r_oid in result_oids:
        meta_syskwargs = {"grid_entry": (0, ), "grid_shape": (1, )}
        shape_dtype_oids.append(
            instance().cm.shape_dtype(r_oid, syskwargs=meta_syskwargs))
    shape_dtypes = instance().cm.get(shape_dtype_oids)

    return [
        BlockArray.from_oid(r_oid, shape=shape, dtype=dtype, cm=instance().cm)
        for r_oid, (shape, dtype) in zip(result_oids, shape_dtypes)
    ]
def where(self, condition: BlockArray, x=None, y=None):
    """Run ``where`` on every block of ``condition`` and stitch the results.

    ``self.system.where`` is invoked once per block and asked for
    ``num_axes + 1`` return values: one object per axis followed by a shape
    object. Blocks whose reported shape equals ``(0,)`` are skipped, and the
    remaining per-axis results are concatenated along axis 0.

    Args:
        condition: The BlockArray to evaluate block by block.
        x: Optional; forwarded unchanged to ``self.system.where``. Must be
            given together with ``y``.
        y: Optional; forwarded unchanged to ``self.system.where``. Must be
            given together with ``x``.

    Returns:
        A tuple with one int64 BlockArray per axis of ``condition`` (one entry
        for 0-d input). If the per-block shapes sum to zero, a 1-tuple holding
        an empty int64 array is returned instead.

    Raises:
        AssertionError: If only one of ``x``/``y`` is given, or if their shape
            or block shape differs from that of ``condition``.
    """
    result_oids = []
    shape_oids = []
    num_axes = max(1, len(condition.shape))

    # x and y must be supplied together and match condition exactly; this is
    # stricter than broadcasting would require.
    if x is not None or y is not None:
        assert x is not None and y is not None
        assert condition.shape == x.shape == y.shape
        assert condition.block_shape == x.block_shape == y.block_shape

    for grid_entry in condition.grid.get_entry_iterator():
        block: Block = condition.blocks[grid_entry]
        block_slice_tuples = condition.grid.get_slice_tuples(grid_entry)
        roids = self.system.where(block.oid,
                                  x,
                                  y,
                                  block_slice_tuples,
                                  syskwargs={
                                      "grid_entry": grid_entry,
                                      "grid_shape": condition.grid.grid_shape,
                                      "options": {
                                          "num_returns": num_axes + 1
                                      }
                                  })
        # The last returned object is the shape; the rest are per-axis results.
        block_oids, shape_oid = roids[:-1], roids[-1]
        shape_oids.append(shape_oid)
        result_oids.append(block_oids)

    shapes = self.system.get(shape_oids)
    result_shape = (np.sum(shapes), )
    if result_shape == (0, ):
        return (self.array(np.array([], dtype=np.int64), block_shape=(0, )), )

    result_block_shape = self.compute_block_shape(result_shape, np.int64)
    result_arrays = []
    for axis in range(num_axes):
        # Skip blocks that reported an empty result.
        block_arrays = [
            BlockArray.from_oid(block_oids[axis], shape, np.int64, self.system)
            for block_oids, shape in zip(result_oids, shapes)
            if not shape == (0, )
        ]
        if len(block_arrays) == 1:
            axis_result = block_arrays[0]
        else:
            axis_result = self.concatenate(block_arrays, 0,
                                           result_block_shape[0])
        result_arrays.append(axis_result)
    return tuple(result_arrays)
def fit_transform(self, X: BlockArray, y: BlockArray = None):
    """Invoke the remote actor's ``fit_transform`` method.

    The first flattened block of ``X`` (and of ``y``, when given) is sent to
    the actor; ``None`` is sent in place of ``y`` when it is omitted.

    Args:
        X: Input BlockArray; validated with ``_check_array(X, True)``.
        y: Optional BlockArray; validated the same way when provided.

    Returns:
        A BlockArray with the same shape as ``X`` and dtype ``float``.
    """
    _check_array(X, True)
    y_oid = None
    if y is not None:
        _check_array(y, True)
        y_oid = y.flattened_oids()[0]
    x_oid = X.flattened_oids()[0]
    result_oid = instance().cm.call_actor_method(
        self.actor, "fit_transform", x_oid, y_oid)
    return BlockArray.from_oid(
        result_oid,
        shape=X.shape,
        dtype=float,
        cm=instance().cm,
    )
def top_k(self, arr: BlockArray, k: int,
          largest=True) -> Tuple[BlockArray, BlockArray]:
    """Select the `k` largest or smallest entries of a 1-D BlockArray.

    A pivot value is obtained with ``quickselect``. The result consists of the
    indices of all entries strictly beyond the pivot, followed by as many
    indices of entries equal to the pivot as are needed to reach `k`. When
    several entries tie with the pivot, no guarantee is made about which of
    them are included.

    Args:
        arr: A 1-D BlockArray.
        k: Number of elements to return; between 1 and ``arr.size``.
        largest: Select the largest elements if true, otherwise the smallest.

    Returns:
        A tuple ``(values, indices)`` of BlockArrays, where ``values`` is
        ``arr[indices]``.

    Raises:
        NotImplementedError: If ``arr`` is not 1-D.
        IndexError: If `k` is below 1 or exceeds the size of ``arr``.
    """
    if arr.ndim != 1:
        raise NotImplementedError("Only 1D 'arr' is currently supported.")
    out_of_range = "'k' must be at least 1 and at most the size of 'arr'."
    if k <= 0:
        raise IndexError(out_of_range)
    if arr.size < k:
        raise IndexError(out_of_range)

    block_oids = arr.flattened_oids()
    # Rank handed to quickselect: k - 1 for the largest, -k for the smallest.
    rank = k - 1 if largest else -k
    pivot_oid = self.quickselect(block_oids, rank)
    pivot = BlockArray.from_oid(pivot_oid, (1, ), arr.dtype, self.cm)
    if largest:
        strict_mask = arr > pivot[0]
    else:
        strict_mask = arr < pivot[0]
    strict_indices = self.where(strict_mask)[0]

    # Fill the remaining slots with entries equal to the pivot.
    tie_indices = self.where(arr == pivot[0])[0]
    remaining = k - strict_indices.size
    tie_fill = tie_indices[:remaining]

    block_size = self.compute_block_shape((k, ), int)[0]
    indices = self.concatenate([strict_indices, tie_fill], 0, block_size)
    return arr[indices], indices
def median(self, arr: BlockArray) -> BlockArray:
    """Compute the median of a 1-D BlockArray using ``quickselect``.

    Args:
        arr: A 1-D BlockArray.

    Returns:
        For an odd number of elements, a BlockArray of shape ``(1,)`` holding
        the middle element. Otherwise, the sum of the two middle elements
        divided by 2.

    Raises:
        NotImplementedError: If ``arr`` is not 1-D.
    """
    if arr.ndim != 1:
        raise NotImplementedError("Only 1D 'arr' is currently supported.")

    block_oids = arr.flattened_oids()

    def select(rank):
        # Wrap the selected element as a single-entry BlockArray.
        oid = self.quickselect(block_oids, rank)
        return BlockArray.from_oid(oid, (1, ), arr.dtype, self.cm)

    if arr.size % 2 == 1:
        return select(arr.size // 2)

    lower = select(arr.size // 2 - 1)
    upper = select(arr.size // 2)
    return (lower + upper) / 2
def quantile(
    self, arr: BlockArray, q: float, interpolation="linear", method="tdigest"
) -> BlockArray:
    """Compute the q-th quantile of the array elements.

    A t-digest is built for every block of ``arr`` and the digests are then
    combined by ``percentiles_from_tdigest``.

    Args:
        arr: A 1-D BlockArray.
        q: quantile to compute, which must be between 0 and 1 inclusive.
        interpolation: interpolation method to use when the desired quantile
            lies between two data points i < j. Only "linear" is accepted.
            also see https://numpy.org/doc/1.20/reference/generated/numpy.quantile.html.
            also see https://docs.dask.org/en/latest/_modules/dask/array/percentile.html.
        method: Only "tdigest" is accepted.

    Returns:
        A BlockArray of shape ``(1,)`` and dtype ``np.float64``.

    Raises:
        Exception: If crick cannot be imported.
        NotImplementedError: If ``arr`` is not 1-D.
        ValueError: If ``q`` lies outside ``[0, 1]``.
        AssertionError: If ``interpolation`` or ``method`` has an unsupported
            value.
    """
    # pylint: disable = import-outside-toplevel, unused-import
    try:
        import crick
    except Exception as e:
        # Implicit concatenation keeps source indentation out of the message
        # (a backslash continuation inside the literal embedded it).
        raise Exception(
            "Unable to import crick. "
            "Install crick with command 'pip install cython; pip install crick'"
        ) from e

    if arr.ndim != 1:
        raise NotImplementedError("Only 1D 'arr' is currently supported.")
    if q < 0.0 or q > 1.0:
        raise ValueError("Quantiles must be in the range [0, 1]")
    assert interpolation == "linear"
    assert method == "tdigest"

    arr_oids = arr.flattened_oids()
    num_arrs = len(arr_oids)
    # percentiles_from_tdigest is handed a list of quantiles.
    q = [q]
    t_oids = []
    for i, arr_oid in enumerate(arr_oids):
        syskwargs = {
            "grid_entry": (i,),
            "grid_shape": (num_arrs,),
            "options": {"num_returns": 1},
        }
        t_oids.append(self.cm.tdigest_chunk(arr_oid, syskwargs=syskwargs))

    # NOTE: the merge step reuses the syskwargs of the last chunk.
    p_oid = self.cm.percentiles_from_tdigest(q, *t_oids, syskwargs=syskwargs)
    return BlockArray.from_oid(p_oid, (1,), np.float64, self.cm)
def array_compare(self, func_name, a: BlockArray, b: BlockArray, *args):
    """Compare two identically partitioned BlockArrays block by block.

    ``self.cm.array_compare`` is called for each pair of corresponding blocks
    and the per-block results are combined with ``self.cm.logical_and``.

    Args:
        func_name: Passed to ``self.cm.array_compare`` as its first argument.
        a: First BlockArray.
        b: Second BlockArray; must have the same shape and block shape as ``a``.
        *args: Extra arguments, forwarded to ``self.cm.array_compare`` as a
            single tuple.

    Returns:
        A 0-d boolean BlockArray wrapping the combined result.

    Raises:
        AssertionError: If the shapes or block shapes of ``a`` and ``b`` differ.
    """
    assert a.shape == b.shape and a.block_shape == b.block_shape
    bool_list = []
    grid_shape = a.grid.grid_shape
    for grid_entry in a.grid.get_entry_iterator():
        a_block, b_block = a.blocks[grid_entry].oid, b.blocks[grid_entry].oid
        bool_list.append(
            self.cm.array_compare(
                func_name,
                a_block,
                b_block,
                args,
                syskwargs={"grid_entry": grid_entry, "grid_shape": grid_shape},
            )
        )
    oid = self.cm.logical_and(
        *bool_list, syskwargs={"grid_entry": (0, 0), "grid_shape": (1, 1)}
    )
    # Builtin bool replaces the `np.bool` alias, which NumPy deprecated in 1.20
    # and removed in 1.24.
    return BlockArray.from_oid(oid, (), bool, self.cm)
def score(self, X: BlockArray, y: BlockArray, sample_weight: BlockArray = None):
    """Invoke the remote actor's ``score`` method.

    The first flattened block of ``X``, of ``y`` and, when given, of
    ``sample_weight`` is sent to the actor; ``None`` is sent in place of
    ``sample_weight`` when it is omitted.

    Args:
        X: Input BlockArray; validated with ``_check_array(X, True)``.
        y: Target BlockArray; validated the same way.
        sample_weight: Optional BlockArray; validated the same way when given.

    Returns:
        A 0-d BlockArray with dtype ``float``.
    """
    _check_array(X, True)
    _check_array(y, True)
    weight_oid = None
    if sample_weight is not None:
        _check_array(sample_weight, True)
        weight_oid = sample_weight.flattened_oids()[0]
    x_oid = X.flattened_oids()[0]
    y_oid = y.flattened_oids()[0]
    result_oid = instance().cm.call_actor_method(
        self.actor, "score", x_oid, y_oid, weight_oid)
    return BlockArray.from_oid(result_oid, shape=(), dtype=float, cm=instance().cm)
def allclose(self, a: BlockArray, b: BlockArray, rtol=1.e-5, atol=1.e-8):
    """Run ``allclose`` on corresponding blocks of ``a`` and ``b``.

    ``self._system.allclose`` is called for each pair of corresponding blocks
    and the per-block results are combined with ``self._system.logical_and``.

    Args:
        a: First BlockArray.
        b: Second BlockArray; must have the same shape and block shape as ``a``.
        rtol: Relative tolerance, forwarded to ``self._system.allclose``.
        atol: Absolute tolerance, forwarded to ``self._system.allclose``.

    Returns:
        A 0-d boolean BlockArray wrapping the combined result.

    Raises:
        AssertionError: If the shapes or block shapes of ``a`` and ``b`` differ.
    """
    assert a.shape == b.shape and a.block_shape == b.block_shape
    bool_list = []
    grid_shape = a.grid.grid_shape
    for grid_entry in a.grid.get_entry_iterator():
        a_block, b_block = a.blocks[grid_entry].oid, b.blocks[
            grid_entry].oid
        bool_list.append(
            self._system.allclose(a_block,
                                  b_block,
                                  rtol,
                                  atol,
                                  syskwargs={
                                      "grid_entry": grid_entry,
                                      "grid_shape": grid_shape
                                  }))
    oid = self._system.logical_and(*bool_list,
                                   syskwargs={
                                       "grid_entry": (0, 0),
                                       "grid_shape": (1, 1)
                                   })
    # Builtin bool replaces the `np.bool` alias, which NumPy deprecated in 1.20
    # and removed in 1.24.
    return BlockArray.from_oid(oid, (), bool, self._system)