Example #1
    def _handle_boolean_masking(self, row_loc, col_loc):
        """
        Retrieve dataset according to the boolean mask for rows and an indexer for columns.

        In comparison with the regular ``loc/iloc.__getitem__`` flow, this method efficiently
        masks rows with a Modin Series boolean mask without materializing it (if the selected
        execution implements such masking).

        Parameters
        ----------
        row_loc : modin.pandas.Series of bool dtype
            Boolean mask to index rows with.
        col_loc : object
            An indexer along column axis.

        Returns
        -------
        modin.pandas.DataFrame or modin.pandas.Series
            Located dataset.
        """
        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=not isinstance(row_loc, Series),
            extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
        )
        masked_df = self.df.__constructor__(
            query_compiler=self.qc.getitem_array(row_loc._query_compiler)
        )
        # Passing `slice(None)` as a row indexer since we've just applied it
        return type(self)(masked_df)[(slice(None), col_loc)]
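To make the two-step flow above concrete, here is a minimal pandas-only sketch (not Modin code): the boolean mask filters rows first, and the column indexer is then applied to the already-masked frame with a full-axis row indexer. The frame, mask, and column list are invented for illustration.

# A minimal pandas-only sketch of the same two-step flow: apply the boolean row
# mask first, then index the columns of the already-masked frame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40], "c": list("wxyz")})
row_mask = pd.Series([True, False, True, False])  # plays the role of `row_loc`
col_loc = ["a", "c"]                              # plays the role of `col_loc`

masked = df[row_mask]             # rows are masked first ...
result = masked.loc[:, col_loc]   # ... then `slice(None)` + `col_loc` selects columns
print(result)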
Example #2
    def _get_blocks_containing_index(self, axis, index):
        """Convert a global index to a block index and local index.

        Note: This method is primarily used to convert a global index into a
            partition index (along the axis provided) and a local index (useful
            for `iloc` or similar operations).

        Args:
            axis: The axis along which to get the indices
                (0 - columns, 1 - rows).
            index: The global index to convert.

        Returns:
            A tuple of (block index, internal index).
        """
        if not axis:
            ErrorMessage.catch_bugs_and_request_email(
                index > sum(self.block_widths))
            cumulative_column_widths = np.array(self.block_widths).cumsum()
            block_idx = int(np.digitize(index, cumulative_column_widths))
            if block_idx == len(cumulative_column_widths):
                block_idx -= 1
            # Compute the internal index based on the previous lengths. This
            # is a global index, so we must subtract the lengths first.
            internal_idx = (index if not block_idx else index -
                            cumulative_column_widths[block_idx - 1])
        else:
            ErrorMessage.catch_bugs_and_request_email(
                index > sum(self.block_lengths))
            cumulative_row_lengths = np.array(self.block_lengths).cumsum()
            block_idx = int(np.digitize(index, cumulative_row_lengths))
            # See note above about internal index
            internal_idx = (index if not block_idx else index -
                            cumulative_row_lengths[block_idx - 1])
        return block_idx, internal_idx
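The global-to-local index arithmetic above can be exercised standalone. The sketch below assumes hypothetical block lengths; `np.digitize` against the cumulative sums picks the owning block, and subtracting the previous cumulative length gives the block-local position.

# Standalone sketch of the global-to-local index mapping, with made-up block sizes.
import numpy as np

block_lengths = [3, 4, 2]                      # hypothetical partition sizes along one axis
cumulative = np.array(block_lengths).cumsum()  # [3, 7, 9]

def to_block_and_internal(global_index):
    block_idx = int(np.digitize(global_index, cumulative))
    if block_idx == len(cumulative):           # clamp the last boundary
        block_idx -= 1
    internal_idx = global_index if block_idx == 0 else global_index - cumulative[block_idx - 1]
    return block_idx, internal_idx

assert to_block_and_internal(0) == (0, 0)
assert to_block_and_internal(3) == (1, 0)      # first row of the second block
assert to_block_and_internal(8) == (2, 1)      # second row of the third block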
Example #3
    def get_indices(cls, axis, partitions, index_func=None):
        """
        Get the internal indices stored in the partitions.

        Parameters
        ----------
        axis : {0, 1}
            The axis along which to extract the labels (0 - index, 1 - columns).
        partitions : NumPy array
            The array of partitions from which to extract the labels.
        index_func : callable
            The function to be used to extract the labels.

        Returns
        -------
        Index
            A Pandas Index object.

        Notes
        -----
        These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = ([idx.apply(func).oid for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func).oid
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = ray.get(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example #4
    def get_indices(cls, axis, partitions, index_func=None):
        """
        Get the internal indices stored in the partitions.

        Parameters
        ----------
        axis : {0, 1}
            Axis to extract the labels over.
        partitions : np.ndarray
            NumPy array with ``PandasFramePartition``-s.
        index_func : callable, default: None
            The function to be used to extract the indices.

        Returns
        -------
        pandas.Index
            A ``pandas.Index`` object.

        Notes
        -----
        These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = ([idx.apply(func).oid for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func).oid
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = ray.get(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
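A rough illustration of how the per-partition labels are stitched into one global index. Real partitions are remote objects resolved through Ray; here plain pandas DataFrames stand in for the first column of blocks, and `index_func` is the usual "grab the index" callable.

# Sketch: gather per-block indexes and append them into one pandas Index.
import pandas as pd

index_func = lambda df: df.index

# hypothetical row-wise blocks (first column of the partition grid)
blocks = [
    pd.DataFrame({"a": [1, 2]}, index=["r0", "r1"]),
    pd.DataFrame({"a": [3]}, index=["r2"]),
]

new_idx = [index_func(block) for block in blocks]
full_index = new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
print(full_index)   # Index(['r0', 'r1', 'r2'], dtype='object')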
Example #5
    def _internal_by(self):
        """
        Get only those components of 'by' that are column labels of the source frame.

        Returns
        -------
        tuple of labels
        """
        if self._internal_by_cache is not no_default:
            return self._internal_by_cache

        internal_by = tuple()
        if self._drop:
            if is_list_like(self._by):
                internal_by = tuple(by for by in self._by
                                    if isinstance(by, str))
            else:
                ErrorMessage.catch_bugs_and_request_email(
                    failure_condition=not isinstance(self._by,
                                                     BaseQueryCompiler),
                    extra_log=f"When 'drop' is True, 'by' must be either list-like or a QueryCompiler, got: {type(self._by)}.",
                )
                internal_by = tuple(self._by.columns)

        self._internal_by_cache = internal_by
        return internal_by
Example #6
    def _index_grouped(self):
        """Construct a dict of {group name -> group labels} for this groupby object.

        The result is cached in ``self._index_grouped_cache``.
        """
        if self._index_grouped_cache is None:
            if hasattr(self._by, "columns") and len(self._by.columns) > 1:
                by = list(self._by.columns)
                is_multi_by = True
            else:
                by = self._by
                is_multi_by = self._is_multi_by
            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using the pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                self._index_grouped_cache = {
                    k: v.index
                    for k, v in self._df._query_compiler.getitem_column_array(by)
                    .to_pandas()
                    .groupby(by=by)
                }
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze()
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #7
    def get_indices(cls, axis, partitions, index_func=None):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis along which to extract the labels (0 - index, 1 - columns).
            partitions: NumPy array of partitions from which to extract the labels.
            index_func: The function to be used to extract the indices.

        Returns:
            A Pandas Index object.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            new_idx = (
                [idx.apply(func).get() for idx in partitions.T[0]]
                if len(partitions.T)
                else []
            )
        else:
            new_idx = (
                [idx.apply(func).get() for idx in partitions[0]]
                if len(partitions)
                else []
            )
        # TODO FIX INFORMATION LEAK!!!!1!!1!!
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example #8
    def to_pandas(cls, partitions):
        """Convert this object into a Pandas DataFrame from the partitions.

        Returns:
            A Pandas DataFrame
        """
        retrieved_objects = [[obj.to_pandas() for obj in part] for part in partitions]
        if all(
            isinstance(part, pandas.Series) for row in retrieved_objects for part in row
        ):
            axis = 0
        elif all(
            isinstance(part, pandas.DataFrame)
            for row in retrieved_objects
            for part in row
        ):
            axis = 1
        else:
            ErrorMessage.catch_bugs_and_request_email(True)
        df_rows = [
            pandas.concat([part for part in row], axis=axis)
            for row in retrieved_objects
            if not all(part.empty for part in row)
        ]
        if len(df_rows) == 0:
            return pandas.DataFrame()
        else:
            return cls.concatenate(df_rows)
Example #9
    def to_pandas(cls, partitions):
        """
        Convert NumPy array of PandasDataframePartition to pandas DataFrame.

        Parameters
        ----------
        partitions : np.ndarray
            NumPy array of PandasDataframePartition.

        Returns
        -------
        pandas.DataFrame
            A pandas DataFrame.
        """
        retrieved_objects = [[obj.to_pandas() for obj in part]
                             for part in partitions]
        if all(
                isinstance(part, pandas.Series) for row in retrieved_objects
                for part in row):
            axis = 0
        elif all(
                isinstance(part, pandas.DataFrame) for row in retrieved_objects
                for part in row):
            axis = 1
        else:
            ErrorMessage.catch_bugs_and_request_email(True)
        df_rows = [
            pandas.concat([part for part in row], axis=axis)
            for row in retrieved_objects if not all(part.empty for part in row)
        ]
        if len(df_rows) == 0:
            return pandas.DataFrame()
        else:
            return concatenate(df_rows)
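The grid-concatenation step can be mimicked with plain pandas. The sketch below assumes a hypothetical 2x2 grid of DataFrame blocks: each row of blocks is glued column-wise (axis=1) and the resulting strips are then stacked row-wise.

# Sketch of concatenating a 2D grid of partition frames back into one DataFrame.
import pandas as pd

grid = [
    [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"b": [3, 4]})],
    [pd.DataFrame({"a": [5]}, index=[2]), pd.DataFrame({"b": [6]}, index=[2])],
]

df_rows = [pd.concat(row, axis=1) for row in grid if not all(part.empty for part in row)]
result = pd.concat(df_rows) if df_rows else pd.DataFrame()
print(result)
#    a  b
# 0  1  3
# 1  2  4
# 2  5  6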
Example #10
    def get_indices(cls, axis, partitions, index_func=None):
        """
        Get the internal indices stored in the partitions.

        Parameters
        ----------
        axis : {0, 1}
            Axis to extract the labels over.
        partitions : np.ndarray
            NumPy array of ``PandasDataframePartition`` objects.
        index_func : callable, default: None
            The function to be used to extract the indices.

        Returns
        -------
        pandas.Index
            A pandas Index object.

        Notes
        -----
        These are the global indices of the object. This is mostly useful
        when you have deleted rows/columns internally, but do not know
        which ones were deleted.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            new_idx = ([idx.apply(func) for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func)
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = cls.get_objects_from_partitions(new_idx)
        # TODO FIX INFORMATION LEAK!!!!1!!1!!
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example #11
    def mask(self, row_indices=None, col_indices=None):
        """Select a subset of the partitions by global row and/or column indices.

        At least one of `row_indices`/`col_indices` must be provided; the global
        indices are translated into per-partition internal indices before masking
        each block.
        """
        ErrorMessage.catch_bugs_and_request_email(row_indices is None
                                                  and col_indices is None)
        if row_indices is not None:
            row_partitions_list = self._get_dict_of_block_index(1,
                                                                row_indices,
                                                                ordered=True)
        else:
            row_partitions_list = [(i, range(self.block_lengths[i]))
                                   for i in range(len(self.block_lengths))]

        if col_indices is not None:
            col_partitions_list = self._get_dict_of_block_index(0,
                                                                col_indices,
                                                                ordered=True)
        else:
            col_partitions_list = [(i, range(self.block_widths[i]))
                                   for i in range(len(self.block_widths))]
        return self.__constructor__(
            np.array([[
                self.partitions[row_idx][col_idx].mask(row_internal_indices,
                                                       col_internal_indices)
                for col_idx, col_internal_indices in col_partitions_list
                if len(col_internal_indices) > 0
            ] for row_idx, row_internal_indices in row_partitions_list
                      if len(row_internal_indices) > 0]))
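A sketch of what `_get_dict_of_block_index` conceptually produces for `mask`: requested global positions are grouped by the block that owns them and re-expressed as block-local positions. Block sizes and the requested rows below are made up.

# Sketch: group requested global positions by owning block and localize them.
import numpy as np

block_lengths = [3, 3, 2]                                 # hypothetical partition heights
requested = [1, 4, 5, 7]                                  # global row positions to keep
cumulative = np.insert(np.cumsum(block_lengths), 0, 0)    # [0, 3, 6, 8]

partitions_list = []
for block_idx in range(len(block_lengths)):
    start, stop = cumulative[block_idx], cumulative[block_idx + 1]
    internal = [pos - start for pos in requested if start <= pos < stop]
    if internal:                                          # blocks with nothing selected are skipped
        partitions_list.append((block_idx, internal))

print(partitions_list)   # [(0, [1]), (1, [1, 2]), (2, [1])]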
Example #12
    def get_indices(cls, axis, partitions, index_func):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis along which to extract the labels (0 - index, 1 - columns).
            partitions: NumPy array of partitions from which to extract the labels.
            index_func: The function to be used to extract the indices.

        Returns:
            A Pandas Index object.
        """
        client = get_client()
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        func = cls.preprocess_func(index_func)
        if axis == 0:
            # We grab the first column of blocks and extract the indices
            new_idx = ([idx.apply(func).future for idx in partitions.T[0]]
                       if len(partitions.T) else [])
        else:
            new_idx = ([idx.apply(func).future
                        for idx in partitions[0]] if len(partitions) else [])
        new_idx = client.gather(new_idx)
        return new_idx[0].append(new_idx[1:]) if len(new_idx) else new_idx
Example #13
File: data.py Project: phdtanvir/modin
    def _apply_index_objs(self, axis=None):
        """Lazily applies the index object (Index or Columns) to the partitions.

        Args:
            axis: The axis to apply to, None applies to both axes.

        Returns:
            A new 2D array of partitions that have the index assignment added to the
            call queue.
        """
        self._filter_empties()
        if axis is None or axis == 0:
            cum_row_lengths = np.cumsum([0] + self._row_lengths)
        if axis is None or axis == 1:
            cum_col_widths = np.cumsum([0] + self._column_widths)

        if axis is None:

            def apply_idx_objs(df, idx, cols):
                df.index, df.columns = idx, cols
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    idx=self.index[slice(cum_row_lengths[i],
                                         cum_row_lengths[i + 1])],
                    cols=self.columns[slice(cum_col_widths[j],
                                            cum_col_widths[j + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        elif axis == 0:

            def apply_idx_objs(df, idx):
                df.index = idx
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    idx=self.index[slice(cum_row_lengths[i],
                                         cum_row_lengths[i + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        elif axis == 1:

            def apply_idx_objs(df, cols):
                df.columns = cols
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    cols=self.columns[slice(cum_col_widths[j],
                                            cum_col_widths[j + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        else:
            ErrorMessage.catch_bugs_and_request_email(
                axis is not None and axis not in [0, 1])
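The label-slicing step can be shown in isolation: the global labels are cut into per-partition chunks with cumulative block lengths before being pushed into each block's call queue. Row lengths and labels below are hypothetical.

# Sketch: slice a global Index into per-partition label chunks.
import numpy as np
import pandas as pd

row_lengths = [2, 3]
index = pd.Index(["r0", "r1", "r2", "r3", "r4"])
cum_row_lengths = np.cumsum([0] + row_lengths)   # [0, 2, 5]

per_block_labels = [
    index[slice(cum_row_lengths[i], cum_row_lengths[i + 1])]
    for i in range(len(row_lengths))
]
print(per_block_labels)   # [Index(['r0', 'r1'], ...), Index(['r2', 'r3', 'r4'], ...)]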
Example #14
    def get_indices(self, axis=0, index_func=None, old_blocks=None):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis along which to extract the labels (0 - index, 1 - columns).
            index_func: The function to be used to extract the indices.
            old_blocks: An optional previous object that this object was
                created from. This is used to compute the correct offsets.

        Returns:
            A Pandas Index object.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        if axis == 0:
            func = self.preprocess_func(index_func)
            # We grab the first column of blocks and extract the indices
            new_indices = [
                idx.apply(func).get() for idx in self.partitions.T[0]
            ]
            # This is important because sometimes we have resized the data. The new
            # sizes will not be valid if we are trying to compute the index on a
            # new object that has a different length.
            if old_blocks is not None:
                cumulative_block_lengths = np.array(
                    old_blocks.block_lengths).cumsum()
            else:
                cumulative_block_lengths = np.array(
                    self.block_lengths).cumsum()
        else:
            func = self.preprocess_func(index_func)
            new_indices = [idx.apply(func).get() for idx in self.partitions[0]]

            if old_blocks is not None:
                cumulative_block_lengths = np.array(
                    old_blocks.block_widths).cumsum()
            else:
                cumulative_block_lengths = np.array(self.block_widths).cumsum()
        full_indices = new_indices[0]
        if old_blocks is not None:
            for i in range(len(new_indices)):
                # If the length is 0 there is nothing to append.
                if i == 0 or len(new_indices[i]) == 0:
                    continue
                # The try-except here is intended to catch issues where we are
                # trying to get a string index out of the internal index.
                try:
                    append_val = new_indices[i] + cumulative_block_lengths[i -
                                                                           1]
                except TypeError:
                    append_val = new_indices[i]

                full_indices = full_indices.append(append_val)
        else:
            full_indices = full_indices.append(new_indices[1:])
        return full_indices
Example #15
    def _index_grouped(self):
        """
        Construct an index of group IDs.

        Returns
        -------
        dict
            A dict of {group name -> group labels} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        if self._index_grouped_cache is None:
            if hasattr(self._by, "columns") and len(self._by.columns) > 1:
                by = list(self._by.columns)
                is_multi_by = True
            else:
                by = self._by
                is_multi_by = self._is_multi_by
            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    hashable(o)
                    and (
                        o in self._df
                        or o in self._df._query_compiler.get_index_names(self._axis)
                    )
                    for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #16
    def to_numpy(self):
        """Converts Modin DataFrame to NumPy Array.

        Returns:
            NumPy Array of the QueryCompiler.
        """
        arr = self._modin_frame.to_numpy()
        ErrorMessage.catch_bugs_and_request_email(
            len(arr) != len(self.index) or len(arr[0]) != len(self.columns))
        return arr
Example #17
    def __getitem__(self, row_lookup, col_lookup, ndim):
        """
        Retrieve dataset according to `row_lookup` and `col_lookup`.

        Parameters
        ----------
        row_lookup : slice(None), range or np.ndarray
            The global row index to retrieve data from.
        col_lookup : slice(None), range or np.ndarray
            The global col index to retrieve data from.
        ndim : {0, 1, 2}
            Number of dimensions in dataset to be retrieved.

        Returns
        -------
        modin.pandas.DataFrame or modin.pandas.Series
            Located dataset.

        Notes
        -----
        Usage of `slice(None)` as a lookup is a hack to pass information about a
        full-axis grab without computing actual indices, which would trigger lazy computations.
        Ideally, this API should get rid of using slices as indexers and either use a
        common ``Indexer`` object or range and ``np.ndarray`` only.
        """
        if isinstance(row_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=row_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}",
            )
            row_lookup = None
        if isinstance(col_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=col_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}",
            )
            col_lookup = None
        qc_view = self.qc.view(row_lookup, col_lookup)
        if ndim == 2:
            return self.df.__constructor__(query_compiler=qc_view)
        if isinstance(self.df, Series) and not self.row_scalar:
            return self.df.__constructor__(query_compiler=qc_view)
        if isinstance(self.df, Series):
            axis = 0
        elif ndim == 0:
            axis = None
        else:
            axis = (
                None
                if self.col_scalar and self.row_scalar
                else 1
                if self.col_scalar
                else 0
            )
        return self.df.__constructor__(query_compiler=qc_view).squeeze(axis=axis)
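The squeeze-axis decision at the end of `__getitem__` is easier to see without the surrounding machinery. The sketch below assumes the caller is a DataFrame (the Series branches are handled earlier) and mirrors the `row_scalar`/`col_scalar` flags.

# Sketch of which axis gets squeezed for a DataFrame result, given the flags above.
def choose_squeeze_axis(ndim, row_scalar, col_scalar):
    if ndim == 2:
        return "no squeeze"        # keep a DataFrame
    if ndim == 0:
        return None                # squeeze both axes -> scalar
    # ndim == 1: squeeze the axis that was reduced to a single label
    return None if (col_scalar and row_scalar) else 1 if col_scalar else 0

print(choose_squeeze_axis(2, False, False))   # no squeeze
print(choose_squeeze_axis(1, True, False))    # 0 -> a single row becomes a Series
print(choose_squeeze_axis(1, False, True))    # 1 -> a single column becomes a Series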
Example #18
File: data.py Project: yyz940922/modin
    def _apply_index_objs(self, axis=None):
        """Eagerly applies the index object (Index or Columns) to the partitions.

        Args:
            axis: The axis to apply to, None applies to both axes.

        Returns:
            A new 2D array of partitions that have the index assignment added to the
            call queue.
        """
        ErrorMessage.catch_bugs_and_request_email(axis is not None
                                                  and axis not in [0, 1])

        cum_row_lengths = np.cumsum([0] + self._row_lengths)
        cum_col_widths = np.cumsum([0] + self._column_widths)

        def apply_idx_objs(df, idx, cols, axis):
            # cudf does not support set_axis. It only supports rename with a 1-to-1 mapping.
            # Therefore, we need to build dictionaries that map the current index/column
            # labels to the new ones.
            idx = {df.index[i]: idx[i] for i in range(len(idx))}
            cols = {df.columns[i]: cols[i] for i in range(len(cols))}

            if axis == 0:
                return df.rename(index=idx)
            elif axis == 1:
                return df.rename(columns=cols)
            else:
                return df.rename(index=idx, columns=cols)

        keys = np.array([[
            self._partitions[i][j].apply(
                apply_idx_objs,
                idx=self.index[slice(cum_row_lengths[i],
                                     cum_row_lengths[i + 1])],
                cols=self.columns[slice(cum_col_widths[j],
                                        cum_col_widths[j + 1])],
                axis=axis,
            ) for j in range(len(self._partitions[i]))
        ] for i in range(len(self._partitions))])

        self._partitions = np.array([[
            cuDFOnRayFramePartition(
                self._partitions[i][j].get_gpu_manager(),
                keys[i][j],
                self._partitions[i][j]._length_cache,
                self._partitions[i][j]._width_cache,
            ) for j in range(len(keys[i]))
        ] for i in range(len(keys))])
Example #19
    def synchronize_labels(self, axis=None):
        """
        Synchronize labels by applying the index object (Index or Columns) to the partitions eagerly.

        Parameters
        ----------
        axis : {0, 1, None}, default: None
            The axis to apply to. If None, it applies to both axes.
        """
        ErrorMessage.catch_bugs_and_request_email(axis is not None
                                                  and axis not in [0, 1])

        cum_row_lengths = np.cumsum([0] + self._row_lengths)
        cum_col_widths = np.cumsum([0] + self._column_widths)

        def apply_idx_objs(df, idx, cols, axis):
            # cudf does not support set_axis. It only supports rename with a 1-to-1 mapping.
            # Therefore, we need to build dictionaries that map the current index/column
            # labels to the new ones.
            idx = {df.index[i]: idx[i] for i in range(len(idx))}
            cols = {df.columns[i]: cols[i] for i in range(len(cols))}

            if axis == 0:
                return df.rename(index=idx)
            elif axis == 1:
                return df.rename(columns=cols)
            else:
                return df.rename(index=idx, columns=cols)

        keys = np.array([[
            self._partitions[i][j].apply(
                apply_idx_objs,
                idx=self.index[slice(cum_row_lengths[i],
                                     cum_row_lengths[i + 1])],
                cols=self.columns[slice(cum_col_widths[j],
                                        cum_col_widths[j + 1])],
                axis=axis,
            ) for j in range(len(self._partitions[i]))
        ] for i in range(len(self._partitions))])

        self._partitions = np.array([[
            cuDFOnRayDataframePartition(
                self._partitions[i][j].get_gpu_manager(),
                keys[i][j],
                self._partitions[i][j]._length_cache,
                self._partitions[i][j]._width_cache,
            ) for j in range(len(keys[i]))
        ] for i in range(len(keys))])
Example #20
    def to_pandas(self, is_transposed=False):
        """Convert this object into a Pandas DataFrame from the partitions.

        Args:
            is_transposed: A flag for telling this object that the external
                representation is transposed, but not the internal.

        Returns:
            A Pandas DataFrame
        """
        # In the case this is transposed, it is easier to just temporarily
        # transpose back and then transpose again after the conversion. The
        # performance is the same as if we individually transposed the blocks
        # and concatenated them, but the code is much smaller.
        if is_transposed:
            return self.transpose().to_pandas(False).T
        else:
            retrieved_objects = [
                [set_indices_for_pandas_concat(obj.to_pandas()) for obj in part]
                for part in self.partitions
            ]
            if all(
                isinstance(part, pandas.Series)
                for row in retrieved_objects
                for part in row
            ):
                axis = 0
            elif all(
                isinstance(part, pandas.DataFrame)
                for row in retrieved_objects
                for part in row
            ):
                axis = 1
            else:
                ErrorMessage.catch_bugs_and_request_email(True)
            df_rows = [
                pandas.concat([part for part in row], axis=axis)
                for row in retrieved_objects
                if not all(part.empty for part in row)
            ]
            if len(df_rows) == 0:
                return pandas.DataFrame()
            else:
                return pandas.concat(df_rows)
Example #21
    def __init__(
        self,
        partitions,
        index,
        columns,
        row_lengths=None,
        column_widths=None,
        dtypes=None,
    ):
        """Initialize a dataframe.

        Args:
            partitions: A 2D numpy array of partitions. Must contain partition objects.
            index: The index object for the dataframe. Converts to a pandas.Index.
            columns: The columns object for the dataframe. Converts to a pandas.Index.
            row_lengths: (optional) The lengths of each partition in the rows. The
                "height" of each of the block partitions. Is computed if not provided.
            column_widths: (optional) The width of each partition in the columns. The
                "width" of each of the block partitions. Is computed if not provided.
            dtypes: (optional) The data types for the dataframe.
        """
        self._partitions = partitions
        self._index_cache = ensure_index(index)
        self._columns_cache = ensure_index(columns)
        if row_lengths is not None and len(self.index) > 0:
            ErrorMessage.catch_bugs_and_request_email(
                sum(row_lengths) != len(self._index_cache),
                "Row lengths: {} != {}".format(
                    sum(row_lengths), len(self._index_cache)
                ),
            )
        self._row_lengths_cache = row_lengths
        if column_widths is not None and len(self.columns) > 0:
            ErrorMessage.catch_bugs_and_request_email(
                sum(column_widths) != len(self._columns_cache),
                "Column widths: {} != {}".format(
                    sum(column_widths), len(self._columns_cache)
                ),
            )
        self._column_widths_cache = column_widths
        self._dtypes = dtypes
        self._filter_empties()
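A short sketch of the invariant the constructor asserts: the per-block row lengths and column widths must add up to the lengths of the frame's index and columns. All values below are hypothetical.

# Sketch of the block-dimension invariant checked above.
import pandas as pd

index = pd.Index(range(5))
columns = pd.Index(["a", "b", "c"])
row_lengths = [2, 3]
column_widths = [2, 1]

assert sum(row_lengths) == len(index), (
    f"Row lengths: {sum(row_lengths)} != {len(index)}"
)
assert sum(column_widths) == len(columns), (
    f"Column widths: {sum(column_widths)} != {len(columns)}"
)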
Example #22
    def to_pandas(self):
        """Converts Modin DataFrame to Pandas DataFrame.

        Returns:
            Pandas DataFrame of the DataManager.
        """
        df = self.data.to_pandas(is_transposed=self._is_transposed)
        if df.empty:
            dtype_dict = {
                col_name: pandas.Series(dtype=self.dtypes[col_name])
                for col_name in self.columns
            }
            df = pandas.DataFrame(dtype_dict, self.index)
        else:
            ErrorMessage.catch_bugs_and_request_email(
                len(df.index) != len(self.index) or len(df.columns) != len(self.columns)
            )
            df.index = self.index
            df.columns = self.columns
        return df
Example #23
    def get_chunks(
        self, n_chunks: Optional[int] = None
    ) -> Iterable["OmnisciProtocolDataframe"]:
        """
        Return an iterator yielding the chunks.

        If `n_chunks` is not specified, yields the chunks in which the data is natively stored.
        If given, `n_chunks` must be a multiple of ``self.num_chunks()``, meaning that each physical
        chunk is going to be split into ``n_chunks // self.num_chunks()`` virtual chunks that are
        backed by the same physical buffers but have different ``.offset`` values.

        Parameters
        ----------
        n_chunks : int, optional
            Number of chunks to yield.

        Returns
        -------
        Iterable["OmnisciProtocolDataframe"]
            An iterator yielding ``OmnisciProtocolDataframe`` objects.

        Raises
        ------
        RuntimeError
            If ``n_chunks`` is not a multiple of ``self.num_chunks()`` or ``n_chunks``
            is greater than ``self.num_rows()``.

        Notes
        -----
        There is a special casing in handling variable-sized columns (i.e. strings) when virtually chunked.
        In order to make the offsets buffer be valid for each virtual chunk, the data buffer shouldn't be
        chunked at all, meaning that ``.get_buffers()["data"]`` always returns a buffer owning the whole
        physical chunk and the consumer must always interpret it with zero offset (validity and offsets
        buffers have to be interpreted respecting the column's offset value).
        """
        if n_chunks is None or n_chunks == self.num_chunks():
            return self._yield_chunks(self._chunk_slices)

        if n_chunks % self.num_chunks() != 0:
            raise RuntimeError(
                "The passed `n_chunks` has to be a multiple of `num_chunks`."
            )

        if n_chunks > self.num_rows():
            raise RuntimeError(
                "The passed `n_chunks` value is bigger than the amout of rows in the frame."
            )

        extra_chunks = 0
        to_subdivide = n_chunks // self.num_chunks()
        subdivided_slices = []

        # The loop subdivides each chunk into `to_subdivide` chunks if possible
        for i in range(len(self._chunk_slices) - 1):
            chunk_length = self._chunk_slices[i + 1] - self._chunk_slices[i]
            step = chunk_length // to_subdivide
            if step == 0:
                # Bad case: we're requested to subdivide a chunk into more pieces than it has rows.
                # This means that there is a bigger chunk that we can subdivide into more pieces to
                # get the required amount of chunks. For now, subdivide the current chunk into the
                # maximum possible number of pieces (TODO: maybe we should subdivide it into
                # `sqrt(chunk_length)` chunks to make this more optimal?) and record the number of
                # missing pieces in the `extra_chunks` variable so they can be taken from bigger
                # chunks later.
                step = 1
                extra_chunks += to_subdivide - chunk_length
                to_subdivide_chunk = chunk_length
            else:
                to_subdivide_chunk = to_subdivide

            for j in range(to_subdivide_chunk):
                subdivided_slices.append(self._chunk_slices[i] + step * j)
        subdivided_slices.append(self._chunk_slices[-1])

        if extra_chunks != 0:
            # Making more pieces from big chunks to get the required amount of `n_chunks`
            for _ in range(extra_chunks):
                # 1. Find the biggest chunk
                # 2. Split it in the middle
                biggest_chunk_idx = np.argmax(np.diff(subdivided_slices))
                new_chunk_offset = (
                    subdivided_slices[biggest_chunk_idx + 1]
                    - subdivided_slices[biggest_chunk_idx]
                ) // 2
                ErrorMessage.catch_bugs_and_request_email(
                    failure_condition=new_chunk_offset == 0,
                    extra_log="No more chunks to subdivide",
                )
                subdivided_slices = np.insert(
                    subdivided_slices,
                    biggest_chunk_idx + 1,
                    subdivided_slices[biggest_chunk_idx] + new_chunk_offset,
                )

        ErrorMessage.catch_bugs_and_request_email(
            failure_condition=len(subdivided_slices) != n_chunks + 1,
            extra_log=f"Chunks were incorrectly split: {len(subdivided_slices)} != {n_chunks + 1}",
        )

        return self._yield_chunks(subdivided_slices)
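The chunk-subdivision arithmetic can be run standalone on a list of slice boundaries. The numbers below are made up; each physical chunk is split into `n_chunks // num_chunks` pieces and any deficit is later taken from the biggest resulting chunk.

# Standalone sketch of subdividing physical chunk boundaries into virtual ones.
import numpy as np

chunk_slices = [0, 10, 13]      # two physical chunks: rows [0, 10) and [10, 13)
n_chunks = 4                    # must be a multiple of the 2 physical chunks
to_subdivide = n_chunks // (len(chunk_slices) - 1)

subdivided, extra_chunks = [], 0
for i in range(len(chunk_slices) - 1):
    length = chunk_slices[i + 1] - chunk_slices[i]
    step = length // to_subdivide
    if step == 0:               # chunk too small: take what we can, remember the deficit
        step, pieces = 1, length
        extra_chunks += to_subdivide - length
    else:
        pieces = to_subdivide
    subdivided.extend(chunk_slices[i] + step * j for j in range(pieces))
subdivided.append(chunk_slices[-1])

for _ in range(extra_chunks):   # split the biggest chunk in the middle
    biggest = int(np.argmax(np.diff(subdivided)))
    offset = (subdivided[biggest + 1] - subdivided[biggest]) // 2
    subdivided = list(np.insert(subdivided, biggest + 1, subdivided[biggest] + offset))

print(subdivided)               # 5 boundaries -> 4 virtual chunks, e.g. [0, 5, 10, 11, 13]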
Example #24
    def _index_grouped(self):
        """
        Construct an index of group IDs.

        Returns
        -------
        dict
            A dict of {group name -> group labels} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        if self._index_grouped_cache is None:
            # Splitting level-by and column-by since we serialize them in different ways
            by = None
            level = []
            if self._level is not None:
                level = self._level
                if not isinstance(level, list):
                    level = [level]
            elif isinstance(self._by, list):
                by = []
                for o in self._by:
                    if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis
                    ):
                        level.append(o)
                    else:
                        by.append(o)
            else:
                by = self._by

            is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

            if hasattr(self._by, "columns") and is_multi_by:
                by = list(self._by.columns)

            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                by = wrap_into_list(by, level)
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                elif self._by is None:
                    index = self._query_compiler.get_axis(self._axis)
                    levels_to_drop = [
                        i
                        for i, name in enumerate(index.names)
                        if name not in level and i not in level
                    ]
                    by = index.droplevel(levels_to_drop)
                    if isinstance(by, pandas.MultiIndex):
                        by = by.reorder_levels(level)
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
Example #25
    def _compute_index_grouped(self, numerical=False):
        """
        Construct an index of group IDs.

        Parameters
        ----------
        numerical : bool, default: False
            Whether the group indices should be positional (True) or label-based (False).

        Returns
        -------
        dict
            A dict of {group name -> group indices} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        # We end up using pure pandas to compute group indices, so raising a warning
        ErrorMessage.default_to_pandas("Group indices computation")

        # Splitting level-by and column-by since we serialize them in different ways
        by = None
        level = []
        if self._level is not None:
            level = self._level
            if not isinstance(level, list):
                level = [level]
        elif isinstance(self._by, list):
            by = []
            for o in self._by:
                if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis):
                    level.append(o)
                else:
                    by.append(o)
        else:
            by = self._by

        is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
        # `dropna` param is the only one that matters for the group indices result
        dropna = self._kwargs.get("dropna", True)

        if hasattr(self._by, "columns") and is_multi_by:
            by = list(self._by.columns)

        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by):
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by).to_pandas()
            else:
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            by = wrap_into_list(by, level)
            groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
            return groupby_obj.indices if numerical else groupby_obj.groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                by = self._by.to_pandas().squeeze().values
            elif self._by is None:
                index = self._query_compiler.get_axis(self._axis)
                levels_to_drop = [
                    i for i, name in enumerate(index.names)
                    if name not in level and i not in level
                ]
                by = index.droplevel(levels_to_drop)
                if isinstance(by, pandas.MultiIndex):
                    by = by.reorder_levels(level)
            else:
                by = self._by
            axis_labels = self._query_compiler.get_axis(self._axis)
            if numerical:
                # Since we want positional indices of the groups, we want to group
                # on a `RangeIndex`, not on the actual index labels
                axis_labels = pandas.RangeIndex(len(axis_labels))
            # `pandas.Index.groupby` doesn't take any parameters except `by`.
            # Have to convert an Index to a Series to be able to process `dropna=False`:
            if dropna:
                return axis_labels.groupby(by)
            else:
                groupby_obj = axis_labels.to_series().groupby(by,
                                                              dropna=dropna)
                return groupby_obj.indices if numerical else groupby_obj.groups
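The label-based vs. positional distinction handled above can be shown with plain pandas. The labels and grouping key below are hypothetical; `Index.groupby` yields the label-based groups, while grouping a `RangeIndex` of the same length yields positional ones.

# Sketch: label-based vs. positional group indices.
import pandas as pd

axis_labels = pd.Index(["a", "b", "c", "d"])
by = pd.Series(["x", "y", "x", "y"]).values

# label-based: maps each group name to the axis labels it contains
label_groups = axis_labels.groupby(by)
print(label_groups)        # groups 'x' -> ['a', 'c'], 'y' -> ['b', 'd']

# positional: group over a RangeIndex instead of the real labels
positional_groups = pd.RangeIndex(len(axis_labels)).groupby(by)
print(positional_groups)   # groups 'x' -> [0, 2], 'y' -> [1, 3]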
Example #26
    def _index_grouped(self):
        """
        Construct an index of group IDs.

        Returns
        -------
        dict
            A dict of {group name -> group labels} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        if self._index_grouped_cache is None:
            # Splitting level-by and column-by since we serialize them in different ways
            by = None
            level = []
            if self._level is not None:
                level = self._level
                if not isinstance(level, list):
                    level = [level]
            elif isinstance(self._by, list):
                by = []
                for o in self._by:
                    if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis
                    ):
                        level.append(o)
                    else:
                        by.append(o)
            else:
                by = self._by

            is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

            if hasattr(self._by, "columns") and is_multi_by:
                by = list(self._by.columns)

            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                by = wrap_into_list(by, level)
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                elif self._by is None:
                    index = self._query_compiler.get_axis(self._axis)
                    levels_to_drop = [
                        i
                        for i, name in enumerate(index.names)
                        if name not in level and i not in level
                    ]
                    by = index.droplevel(levels_to_drop)
                    if isinstance(by, pandas.MultiIndex):
                        by = by.reorder_levels(level)
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache