def _handle_boolean_masking(self, row_loc, col_loc):
    """
    Locate data using a boolean row mask and an arbitrary column indexer.

    The row mask is applied through the query compiler's ``getitem_array``,
    and the column indexer is then resolved by a fresh indexer of the same
    type built over the row-masked frame.

    Parameters
    ----------
    row_loc : modin.pandas.Series of bool dtype
        Boolean mask to index rows with.
    col_loc : object
        An indexer along column axis.

    Returns
    -------
    modin.pandas.DataFrame or modin.pandas.Series
        Located dataset.
    """
    mask_is_series = isinstance(row_loc, Series)
    ErrorMessage.catch_bugs_and_request_email(
        failure_condition=not mask_is_series,
        extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
    )
    row_masked_qc = self.qc.getitem_array(row_loc._query_compiler)
    masked_df = self.df.__constructor__(query_compiler=row_masked_qc)
    # The row selection is already applied, so the new indexer receives
    # `slice(None)` for rows and only has to resolve the column indexer.
    indexer = type(self)(masked_df)
    return indexer[(slice(None), col_loc)]
def _get_blocks_containing_index(self, axis, index):
    """Convert a global index to a block index and local index.

    Note: This method is primarily used to convert a global index into a
        partition index (along the axis provided) and local index (useful
        for `iloc` or similar operations).

    Args:
        axis: The axis along which to get the indices
            (0 - columns, 1 - rows)
        index: The global index to convert.

    Returns:
        A tuple containing (block index and internal index).
    """
    along_columns = not axis
    block_sizes = self.block_widths if along_columns else self.block_lengths
    ErrorMessage.catch_bugs_and_request_email(index > sum(block_sizes))
    boundaries = np.array(block_sizes).cumsum()
    block_idx = int(np.digitize(index, boundaries))
    # Only the column axis clamps an index that lands one past the final
    # boundary back onto the last block; the row axis is left as computed.
    if along_columns and block_idx == len(boundaries):
        block_idx -= 1
    # The index is global, so subtract the combined size of all preceding
    # blocks to obtain the position inside the located block.
    if block_idx:
        internal_idx = index - boundaries[block_idx - 1]
    else:
        internal_idx = index
    return block_idx, internal_idx
def get_indices(cls, axis, partitions, index_func=None):
    """
    Collect the labels stored inside the partitions along one axis.

    Parameters
    ----------
    axis : 0 or 1
        The axis to extract the labels from (0 - index, 1 - columns).
    partitions : NumPy array
        The array of partitions to extract the labels from.
    index_func : callable
        The function used to extract the labels from a single partition.

    Returns
    -------
    Index
        A Pandas Index object (or an empty list when there are no partitions).

    Notes
    -----
    These are the global indices of the object. This is mostly useful
    when you have deleted rows/columns internally, but do not know
    which ones were deleted.
    """
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    # For axis 0 the first column of blocks carries the row labels; for the
    # other axis the first row of blocks carries the column labels.
    if axis == 0:
        blocks = partitions.T[0] if len(partitions.T) else []
    else:
        blocks = partitions[0] if len(partitions) else []
    object_ids = [block.apply(func).oid for block in blocks]
    new_idx = ray.get(object_ids)
    if not len(new_idx):
        return new_idx
    return new_idx[0].append(new_idx[1:])
def get_indices(cls, axis, partitions, index_func=None):
    """
    Get the internal indices stored in the partitions.

    Parameters
    ----------
    axis : {0, 1}
        Axis to extract the labels over.
    partitions : np.ndarray
        NumPy array with ``PandasFramePartition``-s.
    index_func : callable, default: None
        The function to be used to extract the indices. Despite the
        default, a non-callable value is reported as an internal error.

    Returns
    -------
    pandas.Index
        A ``pandas.Index`` object (an empty list if there are no partitions).

    Notes
    -----
    These are the global indices of the object. This is mostly useful
    when you have deleted rows/columns internally, but do not know
    which ones were deleted.
    """
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    if axis == 0:
        # The first column of blocks is enough to recover the row labels
        source_blocks = partitions.T[0] if len(partitions.T) else []
    else:
        source_blocks = partitions[0] if len(partitions) else []
    pending = [part.apply(func).oid for part in source_blocks]
    new_idx = ray.get(pending)
    if len(new_idx):
        head, tail = new_idx[0], new_idx[1:]
        return head.append(tail)
    return new_idx
def _internal_by(self): """ Get only those components of 'by' that are column labels of the source frame. Returns ------- tuple of labels """ if self._internal_by_cache is not no_default: return self._internal_by_cache internal_by = tuple() if self._drop: if is_list_like(self._by): internal_by = tuple(by for by in self._by if isinstance(by, str)) else: ErrorMessage.catch_bugs_and_request_email( failure_condition=not isinstance(self._by, BaseQueryCompiler), extra_log= f"When 'drop' is True, 'by' must be either list-like or a QueryCompiler, met: {type(self._by)}.", ) internal_by = tuple(self._by.columns) self._internal_by_cache = internal_by return internal_by
def _index_grouped(self):
    """
    Build (once) and return the mapping of group keys to group labels.

    The mapping is stored in ``self._index_grouped_cache`` and reused on
    subsequent calls.
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    if hasattr(self._by, "columns") and len(self._by.columns) > 1:
        by, is_multi_by = list(self._by.columns), True
    else:
        by, is_multi_by = self._by, self._is_multi_by

    if is_multi_by:
        # The data is collected (to_pandas) and grouped with the pandas
        # implementation, so warn the user about the fallback.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        by_frame = self._df._query_compiler.getitem_column_array(by).to_pandas()
        grouped = {}
        for key, group in by_frame.groupby(by=by):
            grouped[key] = group.index
        self._index_grouped_cache = grouped
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze()
        else:
            by = self._by
        labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = labels.groupby(by)
    return self._index_grouped_cache
def get_indices(cls, axis, partitions, index_func=None):
    """Get the internal indices stored in the partitions.

    Note: These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.

    Args:
        axis: The axis to extract the labels from (0 - index, 1 - columns).
        partitions: The 2D array of partitions to extract the labels from.
        index_func: The function used to extract the labels from a partition.

    Returns:
        A Pandas Index object (an empty list if there are no partitions).
    """
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    if axis == 0:
        blocks = partitions.T[0] if len(partitions.T) else []
    else:
        blocks = partitions[0] if len(partitions) else []
    new_idx = [block.apply(func).get() for block in blocks]
    # TODO (carried over from the original code): fix information leak.
    if not len(new_idx):
        return new_idx
    return new_idx[0].append(new_idx[1:])
def to_pandas(cls, partitions):
    """Convert a 2D collection of partitions into a single Pandas object.

    Args:
        partitions: Rows of partition objects, each exposing ``to_pandas()``.

    Returns:
        A Pandas DataFrame (empty if every row of blocks is empty), otherwise
        whatever ``cls.concatenate`` builds from the per-row results.
    """
    retrieved_objects = [
        [block.to_pandas() for block in block_row] for block_row in partitions
    ]
    flattened = [part for row in retrieved_objects for part in row]
    if all(isinstance(part, pandas.Series) for part in flattened):
        axis = 0
    elif all(isinstance(part, pandas.DataFrame) for part in flattened):
        axis = 1
    else:
        # A mix of Series and DataFrame blocks is an internal error
        ErrorMessage.catch_bugs_and_request_email(True)

    df_rows = []
    for row in retrieved_objects:
        # Rows made up solely of empty blocks contribute nothing
        if all(part.empty for part in row):
            continue
        df_rows.append(pandas.concat(list(row), axis=axis))

    if not df_rows:
        return pandas.DataFrame()
    return cls.concatenate(df_rows)
def to_pandas(cls, partitions):
    """
    Convert NumPy array of PandasDataframePartition to pandas DataFrame.

    Parameters
    ----------
    partitions : np.ndarray
        NumPy array of PandasDataframePartition.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame; an empty one when every row of blocks is empty.
    """
    retrieved_objects = [
        [block.to_pandas() for block in block_row] for block_row in partitions
    ]
    every_part = [part for row in retrieved_objects for part in row]
    if all(isinstance(part, pandas.Series) for part in every_part):
        axis = 0
    elif all(isinstance(part, pandas.DataFrame) for part in every_part):
        axis = 1
    else:
        # Blocks of mixed kinds cannot be combined; report an internal error
        ErrorMessage.catch_bugs_and_request_email(True)

    df_rows = []
    for row in retrieved_objects:
        row_is_empty = all(part.empty for part in row)
        if not row_is_empty:
            df_rows.append(pandas.concat(list(row), axis=axis))

    if df_rows:
        return concatenate(df_rows)
    return pandas.DataFrame()
def get_indices(cls, axis, partitions, index_func=None):
    """
    Get the internal indices stored in the partitions.

    Parameters
    ----------
    axis : {0, 1}
        Axis to extract the labels over.
    partitions : np.ndarray
        NumPy array with PandasDataframePartition's.
    index_func : callable, default: None
        The function to be used to extract the indices. Despite the
        default, a non-callable value is reported as an internal error.

    Returns
    -------
    pandas.Index
        A pandas Index object (an empty result if there are no partitions).

    Notes
    -----
    These are the global indices of the object. This is mostly useful
    when you have deleted rows/columns internally, but do not know
    which ones were deleted.
    """
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    if axis == 0:
        blocks = partitions.T[0] if len(partitions.T) else []
    else:
        blocks = partitions[0] if len(partitions) else []
    applied = [block.apply(func) for block in blocks]
    new_idx = cls.get_objects_from_partitions(applied)
    # TODO (carried over from the original code): fix information leak.
    if not len(new_idx):
        return new_idx
    return new_idx[0].append(new_idx[1:])
def mask(self, row_indices=None, col_indices=None):
    """Build a new object holding only the requested rows and columns.

    Args:
        row_indices: Row indices to keep, or None to keep every row.
        col_indices: Column indices to keep, or None to keep every column.
            Passing None for both is reported as an internal error.

    Returns:
        The result of ``self.__constructor__`` over the masked 2D array of
        partitions; blocks left with no rows or no columns are dropped.
    """
    ErrorMessage.catch_bugs_and_request_email(
        row_indices is None and col_indices is None
    )

    if row_indices is None:
        # Whole axis requested: every position of every block
        row_partitions_list = [
            (i, range(self.block_lengths[i]))
            for i in range(len(self.block_lengths))
        ]
    else:
        row_partitions_list = self._get_dict_of_block_index(
            1, row_indices, ordered=True
        )

    if col_indices is None:
        col_partitions_list = [
            (i, range(self.block_widths[i]))
            for i in range(len(self.block_widths))
        ]
    else:
        col_partitions_list = self._get_dict_of_block_index(
            0, col_indices, ordered=True
        )

    masked_blocks = []
    for row_idx, row_internal_indices in row_partitions_list:
        if len(row_internal_indices) == 0:
            continue
        masked_blocks.append(
            [
                self.partitions[row_idx][col_idx].mask(
                    row_internal_indices, col_internal_indices
                )
                for col_idx, col_internal_indices in col_partitions_list
                if len(col_internal_indices) > 0
            ]
        )
    return self.__constructor__(np.array(masked_blocks))
def get_indices(cls, axis, partitions, index_func):
    """Get the internal indices stored in the partitions.

    Note: These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.

    Args:
        axis: The axis to extract the labels from (0 - index, 1 - columns).
        partitions: The 2D array of partitions to extract the labels from.
        index_func: The function used to extract the labels from a partition.

    Returns:
        A Pandas Index object (an empty result if there are no partitions).
    """
    client = get_client()
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = cls.preprocess_func(index_func)
    # For axis 0 the first column of blocks carries the row labels; for the
    # other axis the first row of blocks carries the column labels.
    if axis == 0:
        blocks = partitions.T[0] if len(partitions.T) else []
    else:
        blocks = partitions[0] if len(partitions) else []
    futures = [block.apply(func).future for block in blocks]
    new_idx = client.gather(futures)
    if not len(new_idx):
        return new_idx
    return new_idx[0].append(new_idx[1:])
def _apply_index_objs(self, axis=None):
    """Lazily applies the index object (Index or Columns) to the partitions.

    The label assignment is added to each partition's call queue and
    ``self._partitions`` is replaced by the resulting 2D array; nothing is
    returned.

    Args:
        axis: The axis to apply to, None applies to both axes.
    """
    self._filter_empties()
    touches_rows = axis is None or axis == 0
    touches_cols = axis is None or axis == 1
    if touches_rows:
        row_bounds = np.cumsum([0] + self._row_lengths)
    if touches_cols:
        col_bounds = np.cumsum([0] + self._column_widths)

    def row_labels(i):
        # Slice of the global index belonging to the i-th row of blocks
        return self.index[slice(row_bounds[i], row_bounds[i + 1])]

    def col_labels(j):
        # Slice of the global columns belonging to the j-th column of blocks
        return self.columns[slice(col_bounds[j], col_bounds[j + 1])]

    if axis is None:

        def apply_idx_objs(df, idx, cols):
            df.index, df.columns = idx, cols
            return df

        def enqueue(part, i, j):
            return part.add_to_apply_calls(
                apply_idx_objs, idx=row_labels(i), cols=col_labels(j)
            )

    elif axis == 0:

        def apply_idx_objs(df, idx):
            df.index = idx
            return df

        def enqueue(part, i, j):
            return part.add_to_apply_calls(apply_idx_objs, idx=row_labels(i))

    elif axis == 1:

        def apply_idx_objs(df, cols):
            df.columns = cols
            return df

        def enqueue(part, i, j):
            return part.add_to_apply_calls(apply_idx_objs, cols=col_labels(j))

    else:
        enqueue = None

    if enqueue is not None:
        self._partitions = np.array(
            [
                [enqueue(part, i, j) for j, part in enumerate(block_row)]
                for i, block_row in enumerate(self._partitions)
            ]
        )
    # Any other axis value leaves the partitions untouched and is reported
    ErrorMessage.catch_bugs_and_request_email(
        axis is not None and axis not in [0, 1]
    )
def get_indices(self, axis=0, index_func=None, old_blocks=None):
    """Get the internal indices stored in the partitions.

    Note: These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.

    Args:
        axis: The axis to extract the labels from (0 - index, 1 - columns).
        index_func: The function used to extract the labels from a partition.
        old_blocks: An optional previous object that this object was created
            from. This is used to compute the correct offsets.

    Returns:
        A Pandas Index object.
    """
    ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
    func = self.preprocess_func(index_func)
    along_rows = axis == 0
    # The first column of blocks carries the row labels, the first row of
    # blocks carries the column labels.
    first_blocks = self.partitions.T[0] if along_rows else self.partitions[0]
    new_indices = [block.apply(func).get() for block in first_blocks]

    # Sizes are taken from the previous object when one is given, because
    # this object may have been resized and its own sizes would not match.
    size_source = self if old_blocks is None else old_blocks
    if along_rows:
        block_sizes = size_source.block_lengths
    else:
        block_sizes = size_source.block_widths
    cumulative_block_lengths = np.array(block_sizes).cumsum()

    full_indices = new_indices[0]
    if old_blocks is None:
        return full_indices.append(new_indices[1:])

    for position, labels in enumerate(new_indices[1:], start=1):
        # If the length is 0 there is nothing to append.
        if len(labels) == 0:
            continue
        # Labels that cannot be offset (e.g. strings) raise a TypeError and
        # are appended unchanged.
        try:
            append_val = labels + cumulative_block_lengths[position - 1]
        except TypeError:
            append_val = labels
        full_indices = full_indices.append(append_val)
    return full_indices
def _index_grouped(self):
    """
    Build (once) and return the mapping of group keys to group labels.

    The mapping is stored in ``self._index_grouped_cache`` and reused on
    subsequent calls.

    Returns
    -------
    dict-like
        The ``.groups`` of a pandas groupby when grouping on multiple keys,
        otherwise the result of ``groupby`` on the axis labels.
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    if hasattr(self._by, "columns") and len(self._by.columns) > 1:
        by, is_multi_by = list(self._by.columns), True
    else:
        by, is_multi_by = self._by, self._is_multi_by

    if is_multi_by:
        # The data is collected (to_pandas) and grouped with the pandas
        # implementation, so warn the user about the fallback.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")

        def is_known_label(obj):
            # Hashable and present either in the frame or among index names
            return hashable(obj) and (
                obj in self._df
                or obj in self._df._query_compiler.get_index_names(self._axis)
            )

        if isinstance(by, list) and all(is_known_label(o) for o in by):
            # Only the grouping columns need to be materialized
            pandas_df = self._df._query_compiler.getitem_column_array(
                by
            ).to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        self._index_grouped_cache = pandas_df.groupby(by=by).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze().values
        else:
            by = self._by
        labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = labels.groupby(by)
    return self._index_grouped_cache
def to_numpy(self):
    """Converts Modin DataFrame to NumPy Array.

    The shape of the produced array is checked against this object's
    ``index`` and ``columns``; a mismatch is reported as an internal error.

    Returns:
        NumPy Array of the QueryCompiler.
    """
    arr = self._modin_frame.to_numpy()
    # The width can only be read off the first row when a row exists:
    # indexing ``arr[0]`` on a zero-row array raises an IndexError.
    shape_mismatch = len(arr) != len(self.index) or (
        len(arr) > 0 and len(arr[0]) != len(self.columns)
    )
    ErrorMessage.catch_bugs_and_request_email(shape_mismatch)
    return arr
def __getitem__(self, row_lookup, col_lookup, ndim):
    """
    Retrieve dataset according to `row_lookup` and `col_lookup`.

    Parameters
    ----------
    row_lookup : slice(None), range or np.ndarray
        The global row index to retrieve data from.
    col_lookup : slice(None), range or np.ndarray
        The global col index to retrieve data from.
    ndim : {0, 1, 2}
        Number of dimensions in dataset to be retrieved.

    Returns
    -------
    modin.pandas.DataFrame or modin.pandas.Series
        Located dataset.

    Notes
    -----
    Usage of `slice(None)` as a lookup is a hack to pass information about
    full-axis grab without computing actual indices that triggers lazy
    computations. Ideally, this API should get rid of using slices as
    indexers and either use a common ``Indexer`` object or range and
    ``np.ndarray`` only.
    """
    # A slice lookup may only be the full-axis `slice(None)`; it is passed
    # to the query compiler as None.
    if isinstance(row_lookup, slice):
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=row_lookup != slice(None),
            extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}",
        )
        row_lookup = None
    if isinstance(col_lookup, slice):
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=col_lookup != slice(None),
            extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}",
        )
        col_lookup = None

    qc_view = self.qc.view(row_lookup, col_lookup)
    located = self.df.__constructor__(query_compiler=qc_view)
    source_is_series = isinstance(self.df, Series)

    # No squeezing for 2D results or for a Series indexed with a non-scalar
    if ndim == 2 or (source_is_series and not self.row_scalar):
        return located

    if source_is_series:
        axis = 0
    elif ndim == 0 or (self.col_scalar and self.row_scalar):
        axis = None
    elif self.col_scalar:
        axis = 1
    else:
        axis = 0
    return located.squeeze(axis=axis)
def _apply_index_objs(self, axis=None):
    """Eagerly applies the index object (Index or Columns) to the partitions.

    Every partition is relabelled through ``apply`` and ``self._partitions``
    is replaced with a new 2D array of partitions wrapping the results;
    nothing is returned.

    Args:
        axis: The axis to apply to, None applies to both axes. Any other
            value than 0, 1 or None is reported as an internal error.
    """
    ErrorMessage.catch_bugs_and_request_email(
        axis is not None and axis not in [0, 1]
    )

    cum_row_lengths = np.cumsum([0] + self._row_lengths)
    cum_col_widths = np.cumsum([0] + self._column_widths)

    def apply_idx_objs(df, idx, cols, axis):
        # cudf does not support set_axis. It only supports rename with 1-to-1 mapping.
        # Therefore, we need to create the dictionary that have the relationship between
        # current index and new ones.
        idx = {df.index[i]: idx[i] for i in range(len(idx))}
        # The column mapping has to be keyed by the current *column* labels;
        # keying it by row labels renames nothing (or the wrong columns).
        cols = {df.columns[i]: cols[i] for i in range(len(cols))}

        if axis == 0:
            return df.rename(index=idx)
        elif axis == 1:
            return df.rename(columns=cols)
        else:
            return df.rename(index=idx, columns=cols)

    keys = np.array(
        [
            [
                self._partitions[i][j].apply(
                    apply_idx_objs,
                    idx=self.index[
                        slice(cum_row_lengths[i], cum_row_lengths[i + 1])
                    ],
                    cols=self.columns[
                        slice(cum_col_widths[j], cum_col_widths[j + 1])
                    ],
                    axis=axis,
                )
                for j in range(len(self._partitions[i]))
            ]
            for i in range(len(self._partitions))
        ]
    )

    # Wrap the produced keys into new partitions, reusing the GPU manager
    # and the cached length/width of the partition each key came from.
    self._partitions = np.array(
        [
            [
                cuDFOnRayFramePartition(
                    self._partitions[i][j].get_gpu_manager(),
                    keys[i][j],
                    self._partitions[i][j]._length_cache,
                    self._partitions[i][j]._width_cache,
                )
                for j in range(len(keys[i]))
            ]
            for i in range(len(keys))
        ]
    )
def synchronize_labels(self, axis=None):
    """
    Synchronize labels by applying the index object (Index or Columns) to the partitions eagerly.

    Every partition is relabelled through ``apply`` and ``self._partitions``
    is replaced with a new 2D array of partitions wrapping the results.

    Parameters
    ----------
    axis : {0, 1, None}, default: None
        The axis to apply to. If None, it applies to both axes. Any other
        value is reported as an internal error.
    """
    ErrorMessage.catch_bugs_and_request_email(
        axis is not None and axis not in [0, 1]
    )

    cum_row_lengths = np.cumsum([0] + self._row_lengths)
    cum_col_widths = np.cumsum([0] + self._column_widths)

    def apply_idx_objs(df, idx, cols, axis):
        # cudf does not support set_axis. It only supports rename with 1-to-1 mapping.
        # Therefore, we need to create the dictionary that have the relationship between
        # current index and new ones.
        idx = {df.index[i]: idx[i] for i in range(len(idx))}
        # The column mapping has to be keyed by the current *column* labels;
        # keying it by row labels renames nothing (or the wrong columns).
        cols = {df.columns[i]: cols[i] for i in range(len(cols))}

        if axis == 0:
            return df.rename(index=idx)
        elif axis == 1:
            return df.rename(columns=cols)
        else:
            return df.rename(index=idx, columns=cols)

    keys = np.array(
        [
            [
                self._partitions[i][j].apply(
                    apply_idx_objs,
                    idx=self.index[
                        slice(cum_row_lengths[i], cum_row_lengths[i + 1])
                    ],
                    cols=self.columns[
                        slice(cum_col_widths[j], cum_col_widths[j + 1])
                    ],
                    axis=axis,
                )
                for j in range(len(self._partitions[i]))
            ]
            for i in range(len(self._partitions))
        ]
    )

    # Wrap the produced keys into new partitions, reusing the GPU manager
    # and the cached length/width of the partition each key came from.
    self._partitions = np.array(
        [
            [
                cuDFOnRayDataframePartition(
                    self._partitions[i][j].get_gpu_manager(),
                    keys[i][j],
                    self._partitions[i][j]._length_cache,
                    self._partitions[i][j]._width_cache,
                )
                for j in range(len(keys[i]))
            ]
            for i in range(len(keys))
        ]
    )
def to_pandas(self, is_transposed=False):
    """Convert this object into a Pandas DataFrame from the partitions.

    Args:
        is_transposed: A flag for telling this object that the external
            representation is transposed, but not the internal.

    Returns:
        A Pandas DataFrame
    """
    # When transposed, it is simplest to transpose back, convert, and then
    # transpose the pandas result, instead of transposing each block.
    if is_transposed:
        return self.transpose().to_pandas(False).T

    retrieved_objects = [
        [set_indices_for_pandas_concat(block.to_pandas()) for block in block_row]
        for block_row in self.partitions
    ]
    every_part = [part for row in retrieved_objects for part in row]
    if all(isinstance(part, pandas.Series) for part in every_part):
        axis = 0
    elif all(isinstance(part, pandas.DataFrame) for part in every_part):
        axis = 1
    else:
        # Blocks of mixed kinds cannot be combined; report an internal error
        ErrorMessage.catch_bugs_and_request_email(True)

    df_rows = []
    for row in retrieved_objects:
        # Rows made up solely of empty blocks contribute nothing
        if all(part.empty for part in row):
            continue
        df_rows.append(pandas.concat(list(row), axis=axis))

    if not df_rows:
        return pandas.DataFrame()
    return pandas.concat(df_rows)
def __init__(
    self,
    partitions,
    index,
    columns,
    row_lengths=None,
    column_widths=None,
    dtypes=None,
):
    """Initialize a dataframe.

    Args:
        partitions: A 2D numpy array of partitions. Must contain partition objects.
        index: The index object for the dataframe. Converts to a pandas.Index.
        columns: The columns object for the dataframe. Converts to a pandas.Index.
        row_lengths: (optional) The lengths of each partition in the rows. The
            "height" of each of the block partitions. Is computed if not provided.
        column_widths: (optional) The width of each partition in the columns. The
            "width" of each of the block partitions. Is computed if not provided.
        dtypes: (optional) The data types for the dataframe.
    """
    self._partitions = partitions
    self._index_cache = ensure_index(index)
    self._columns_cache = ensure_index(columns)

    # Provided sizes must add up to the number of labels on a non-empty axis
    if row_lengths is not None and len(self.index) > 0:
        total_rows = sum(row_lengths)
        n_row_labels = len(self._index_cache)
        ErrorMessage.catch_bugs_and_request_email(
            total_rows != n_row_labels,
            "Row lengths: {} != {}".format(total_rows, n_row_labels),
        )
    self._row_lengths_cache = row_lengths

    if column_widths is not None and len(self.columns) > 0:
        total_cols = sum(column_widths)
        n_col_labels = len(self._columns_cache)
        ErrorMessage.catch_bugs_and_request_email(
            total_cols != n_col_labels,
            "Column widths: {} != {}".format(total_cols, n_col_labels),
        )
    self._column_widths_cache = column_widths

    self._dtypes = dtypes
    self._filter_empties()
def to_pandas(self):
    """Converts Modin DataFrame to Pandas DataFrame.

    An empty result is rebuilt from this object's columns, dtypes and index;
    a non-empty result gets this object's index and columns assigned after
    its shape has been checked against them.

    Returns:
        Pandas DataFrame of the DataManager.
    """
    df = self.data.to_pandas(is_transposed=self._is_transposed)
    if df.empty:
        # One empty, correctly typed Series per column
        typed_columns = {}
        for col_name in self.columns:
            typed_columns[col_name] = pandas.Series(dtype=self.dtypes[col_name])
        return pandas.DataFrame(typed_columns, self.index)

    rows_differ = len(df.index) != len(self.index)
    ErrorMessage.catch_bugs_and_request_email(
        rows_differ or len(df.columns) != len(self.columns)
    )
    df.index = self.index
    df.columns = self.columns
    return df
def get_chunks(
    self, n_chunks: Optional[int] = None
) -> Iterable["OmnisciProtocolDataframe"]:
    """
    Return an iterator yielding the chunks.

    If `n_chunks` is not specified, yields the chunks that the data is stored underneath.
    If given, `n_chunks` must be a multiple of ``self.num_chunks()``, meaning that each physical
    chunk is going to be split into ``n_chunks // self.num_chunks()`` virtual chunks, that are
    backed by the same physical buffers but have different ``.offset`` values.

    Parameters
    ----------
    n_chunks : int, optional
        Number of chunks to yield.

    Returns
    -------
    Iterable["OmnisciProtocolDataframe"]
        An iterator yielding ``OmnisciProtocolDataframe`` objects.

    Raises
    ------
    ``RuntimeError`` if ``n_chunks`` is not a multiple of ``self.num_chunks()`` or ``n_chunks``
    is greater than ``self.num_rows()``.

    Notes
    -----
    There is a special casing in handling variable-sized columns (i.e. strings) when virtually chunked.
    In order to make the offsets buffer be valid for each virtual chunk, the data buffer shouldn't be
    chunked at all, meaning that ``.get_buffers()["data"]`` always returns a buffer owning the whole
    physical chunk and the consumer must always interpret it with zero offset (validity and offsets
    buffers have to be interpreted respecting the column's offset value).
    """
    if n_chunks is None or n_chunks == self.num_chunks():
        return self._yield_chunks(self._chunk_slices)

    physical_chunks = self.num_chunks()
    if n_chunks % physical_chunks != 0:
        raise RuntimeError(
            "The passed `n_chunks` has to be a multiple of `num_chunks`."
        )
    if n_chunks > self.num_rows():
        raise RuntimeError(
            "The passed `n_chunks` value is bigger than the amout of rows in the frame."
        )

    to_subdivide = n_chunks // physical_chunks
    extra_chunks = 0
    subdivided_slices = []
    boundaries = self._chunk_slices

    # Subdivide every physical chunk into `to_subdivide` pieces where possible
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        chunk_length = stop - start
        step = chunk_length // to_subdivide
        pieces = to_subdivide
        if step == 0:
            # Bad case: the chunk has fewer rows than the requested number of
            # pieces. It is split into as many pieces as it has rows, and the
            # shortfall is recorded in `extra_chunks` so that bigger chunks
            # can be split further below.
            # (TODO: maybe subdivide into `sqrt(chunk_length)` chunks to make
            # this more optimal?)
            step = 1
            extra_chunks += to_subdivide - chunk_length
            pieces = chunk_length
        subdivided_slices.extend(start + step * j for j in range(pieces))
    subdivided_slices.append(boundaries[-1])

    # Make up for the shortfall: repeatedly split the currently biggest
    # chunk in the middle until the requested amount is reached.
    for _ in range(extra_chunks):
        biggest_chunk_idx = np.argmax(np.diff(subdivided_slices))
        left_edge = subdivided_slices[biggest_chunk_idx]
        right_edge = subdivided_slices[biggest_chunk_idx + 1]
        new_chunk_offset = (right_edge - left_edge) // 2
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=new_chunk_offset == 0,
            extra_log="No more chunks to subdivide",
        )
        subdivided_slices = np.insert(
            subdivided_slices,
            biggest_chunk_idx + 1,
            left_edge + new_chunk_offset,
        )

    ErrorMessage.catch_bugs_and_request_email(
        failure_condition=len(subdivided_slices) != n_chunks + 1,
        extra_log=f"Chunks were incorrectly split: {len(subdivided_slices)} != {n_chunks + 1}",
    )
    return self._yield_chunks(subdivided_slices)
def _index_grouped(self):
    """
    Construct an index of group IDs.

    The result is stored in ``self._index_grouped_cache`` and reused on
    subsequent calls.

    Returns
    -------
    dict
        A dict of {group name -> group labels} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    # Level-by and column-by are split since they are serialized differently
    by = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        by = []
        for component in self._by:
            names_an_index_level = hashable(
                component
            ) and component in self._query_compiler.get_index_names(self._axis)
            if names_an_index_level:
                level.append(component)
            else:
                by.append(component)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    if hasattr(self._by, "columns") and is_multi_by:
        by = list(self._by.columns)

    if is_multi_by:
        # The data is collected (to_pandas) and grouped with the pandas
        # implementation, so warn the user about the fallback.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        all_labels = isinstance(by, list) and all(
            is_label(self._df, o, self._axis) for o in by
        )
        if all_labels:
            # Only the grouping columns need to be materialized
            pandas_df = self._df._query_compiler.getitem_column_array(
                by
            ).to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        self._index_grouped_cache = pandas_df.groupby(by=by).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze().values
        elif self._by is None:
            # Group on the requested index levels only
            index = self._query_compiler.get_axis(self._axis)
            levels_to_drop = [
                pos
                for pos, name in enumerate(index.names)
                if not (name in level or pos in level)
            ]
            by = index.droplevel(levels_to_drop)
            if isinstance(by, pandas.MultiIndex):
                by = by.reorder_levels(level)
        else:
            by = self._by
        labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = labels.groupby(by)
    return self._index_grouped_cache
def _compute_index_grouped(self, numerical=False):
    """
    Construct an index of group IDs.

    Parameters
    ----------
    numerical : bool, default: False
        Whether a group indices should be positional (True) or label-based (False).

    Returns
    -------
    dict
        A dict of {group name -> group indices} values.

    See Also
    --------
    pandas.core.groupby.GroupBy.groups
    """
    # Group indices are computed with pure pandas, so raise a warning
    ErrorMessage.default_to_pandas("Group indices computation")

    # Level-by and column-by are split since they are serialized differently
    by = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        by = []
        for component in self._by:
            names_an_index_level = hashable(
                component
            ) and component in self._query_compiler.get_index_names(self._axis)
            if names_an_index_level:
                level.append(component)
            else:
                by.append(component)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    # `dropna` param is the only one that matters for the group indices result
    dropna = self._kwargs.get("dropna", True)

    if hasattr(self._by, "columns") and is_multi_by:
        by = list(self._by.columns)

    if is_multi_by:
        # The data is collected (to_pandas) and grouped with the pandas
        # implementation in this branch.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        all_labels = isinstance(by, list) and all(
            is_label(self._df, o, self._axis) for o in by
        )
        if all_labels:
            # Only the grouping columns need to be materialized
            pandas_df = self._df._query_compiler.getitem_column_array(
                by
            ).to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
        if numerical:
            return groupby_obj.indices
        return groupby_obj.groups

    if isinstance(self._by, type(self._query_compiler)):
        by = self._by.to_pandas().squeeze().values
    elif self._by is None:
        # Group on the requested index levels only
        index = self._query_compiler.get_axis(self._axis)
        levels_to_drop = [
            pos
            for pos, name in enumerate(index.names)
            if not (name in level or pos in level)
        ]
        by = index.droplevel(levels_to_drop)
        if isinstance(by, pandas.MultiIndex):
            by = by.reorder_levels(level)
    else:
        by = self._by

    axis_labels = self._query_compiler.get_axis(self._axis)
    if numerical:
        # Positional group indices are wanted, so group a `RangeIndex`
        # instead of the actual labels
        axis_labels = pandas.RangeIndex(len(axis_labels))

    # `pandas.Index.groupby` doesn't take any parameters except `by`, so
    # `dropna=False` has to go through a Series built from the Index.
    if dropna:
        return axis_labels.groupby(by)
    groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
    if numerical:
        return groupby_obj.indices
    return groupby_obj.groups
def _index_grouped(self):
    """
    Build (once) and return the mapping of group keys to group labels.

    The result is stored in ``self._index_grouped_cache`` and reused on
    subsequent calls.

    Returns
    -------
    dict-like
        The ``.groups`` of a pandas groupby when grouping on multiple keys,
        otherwise the result of ``groupby`` on the axis labels.
    """
    if self._index_grouped_cache is not None:
        return self._index_grouped_cache

    # Level-by and column-by are split since they are serialized differently
    by = None
    level = []
    if self._level is not None:
        level = self._level if isinstance(self._level, list) else [self._level]
    elif isinstance(self._by, list):
        by = []
        for component in self._by:
            names_an_index_level = hashable(
                component
            ) and component in self._query_compiler.get_index_names(self._axis)
            if names_an_index_level:
                level.append(component)
            else:
                by.append(component)
    else:
        by = self._by

    is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
    if hasattr(self._by, "columns") and is_multi_by:
        by = list(self._by.columns)

    if is_multi_by:
        # The data is collected (to_pandas) and grouped with the pandas
        # implementation, so warn the user about the fallback.
        ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
        ErrorMessage.default_to_pandas("Groupby with multiple columns")
        all_labels = isinstance(by, list) and all(
            is_label(self._df, o, self._axis) for o in by
        )
        if all_labels:
            # Only the grouping columns need to be materialized
            pandas_df = self._df._query_compiler.getitem_column_array(
                by
            ).to_pandas()
        else:
            by = try_cast_to_pandas(by, squeeze=True)
            pandas_df = self._df._to_pandas()
        by = wrap_into_list(by, level)
        self._index_grouped_cache = pandas_df.groupby(by=by).groups
    else:
        if isinstance(self._by, type(self._query_compiler)):
            by = self._by.to_pandas().squeeze().values
        elif self._by is None:
            # Group on the requested index levels only
            index = self._query_compiler.get_axis(self._axis)
            levels_to_drop = [
                pos
                for pos, name in enumerate(index.names)
                if not (name in level or pos in level)
            ]
            by = index.droplevel(levels_to_drop)
            if isinstance(by, pandas.MultiIndex):
                by = by.reorder_levels(level)
        else:
            by = self._by
        labels = self._index if self._axis == 0 else self._columns
        self._index_grouped_cache = labels.groupby(by)
    return self._index_grouped_cache