Example #1
    def _can_downcast_to_series(self, df, arg):
        """
        This method encapsulates the logic used
        to determine whether or not the result of a loc/iloc
        operation should be "downcasted" from a DataFrame to a
        Series
        """
        from cudf.core.column import as_column

        if isinstance(df, cudf.Series):
            return False
        nrows, ncols = df.shape
        if nrows == 1:
            if type(arg[0]) is slice:
                if not is_scalar(arg[1]):
                    return False
            elif (is_list_like(arg[0]) or is_column_like(arg[0])) and (
                is_list_like(arg[1])
                # check the column selector here, not the row selector
                or is_column_like(arg[1])
                or type(arg[1]) is slice
            ):
                return False
            else:
                if pd.api.types.is_bool_dtype(
                    as_column(arg[0]).dtype
                ) and not isinstance(arg[1], slice):
                    return True
            dtypes = df.dtypes.values.tolist()
            all_numeric = all(
                [pd.api.types.is_numeric_dtype(t) for t in dtypes]
            )
            if all_numeric:
                return True
        if ncols == 1:
            if type(arg[1]) is slice:
                return False
            if isinstance(arg[1], tuple):
                # Multiindex indexing with a slice
                if any(isinstance(v, slice) for v in arg):
                    return False
            if not (is_list_like(arg[1]) or is_column_like(arg[1])):
                return True
        return False
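
A brief usage sketch of the behavior this helper governs (hedged; exact result dtypes can vary by cudf version):

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
row = df.loc[0]            # a single row with scalar selectors downcasts to a Series
sub = df.loc[[0], ["a"]]   # list-like selectors keep the DataFrame shape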
Example #2
    def length_check(obj, name):
        err_msg = ("Length of '{name}' ({len_obj}) did not match the "
                   "length of the columns being encoded ({len_required}).")

        if is_list_like(obj):
            if len(obj) != len(columns):
                err_msg = err_msg.format(name=name,
                                         len_obj=len(obj),
                                         len_required=len(columns))
                raise ValueError(err_msg)
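
For context, this validator is the sort of length check cudf's get_dummies runs on its prefix/prefix_sep arguments; a hedged sketch of how the error surfaces:

import cudf

df = cudf.DataFrame({"a": ["x", "y"], "b": ["u", "v"]})
# Two columns are being encoded but only one prefix is supplied,
# so a length check like the one above raises ValueError.
cudf.get_dummies(df, columns=["a", "b"], prefix=["p"])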
Example #3
def _filter_stripes(filters,
                    filepath_or_buffer,
                    stripes=None,
                    skip_rows=None,
                    num_rows=None):
    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Prepare filters
    filters = ioutils._prepare_filters(filters)

    # Get columns relevant to filtering
    columns_in_predicate = [
        col for conjunction in filters for (col, op, val) in conjunction
    ]

    # Read and parse file-level and stripe-level statistics
    file_statistics, stripes_statistics = read_orc_statistics(
        filepath_or_buffer, columns_in_predicate)

    file_stripe_map = []
    for file_stat in file_statistics:
        # Filter using file-level statistics
        if not ioutils._apply_filters(filters, file_stat):
            continue

        # Filter using stripe-level statistics
        selected_stripes = []
        num_rows_scanned = 0
        for i, stripe_statistics in enumerate(stripes_statistics):
            num_rows_before_stripe = num_rows_scanned
            num_rows_scanned += next(iter(
                stripe_statistics.values()))["number_of_values"]
            if stripes is not None and i not in stripes:
                continue
            # Skip stripes that end at or before the requested skip_rows
            # offset; once the first kept stripe is reached, stop skipping.
            if skip_rows is not None and num_rows_scanned <= skip_rows:
                continue
            else:
                skip_rows = 0
            # Stop considering stripes that begin past the requested window
            if (skip_rows is not None and num_rows is not None
                    and num_rows_before_stripe >= skip_rows + num_rows):
                continue
            if ioutils._apply_filters(filters, stripe_statistics):
                selected_stripes.append(i)

        file_stripe_map.append(selected_stripes)

    return file_stripe_map
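
The filters argument follows the usual disjunctive-normal-form convention: an outer list of OR-ed conjunctions, each a list of AND-ed (column, op, value) tuples. A minimal call sketch, assuming "data.orc" is a local file containing the referenced columns:

# Keep only stripes whose statistics admit rows with year == 2020 AND value > 10
filters = [[("year", "=", 2020), ("value", ">", 10)]]
file_stripe_map = _filter_stripes(filters, "data.orc")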
Example #4
    def isin(self, values, level=None):
        """Return a boolean array where the index values are in values.

        Compute boolean array of whether each index value is found in
        the passed set of values. The length of the returned boolean
        array matches the length of the index.

        Parameters
        ----------
        values : set, list-like, Index or MultiIndex
            Sought values.
        level : str or int, optional
            Name or position of the index level to use (if the index
            is a MultiIndex).

        Returns
        -------
        is_contained : cupy array
            CuPy array of boolean values.

        Notes
        -----
        When `level` is None, `values` must be a MultiIndex or a
        set/list-like of tuples.
        When `level` is provided, `values` can be an Index, a MultiIndex,
        or a set/list-like of tuples.
        """
        from cudf.utils.dtypes import is_list_like

        if level is None:
            if isinstance(values, cudf.MultiIndex):
                values_idx = values
            elif (
                (
                    isinstance(
                        values,
                        (
                            cudf.Series,
                            cudf.Index,
                            cudf.DataFrame,
                            column.ColumnBase,
                        ),
                    )
                )
                or (not is_list_like(values))
                or (
                    is_list_like(values)
                    and len(values) > 0
                    and not isinstance(values[0], tuple)
                )
            ):
                raise TypeError(
                    "values need to be a MultiIndex or set/list-like of "
                    "tuple sequences when `level=None`."
                )
            else:
                values_idx = cudf.MultiIndex.from_tuples(
                    values, names=self.names
                )

            res = []
            for name in self.names:
                level_idx = self.get_level_values(name)
                value_idx = values_idx.get_level_values(name)

                existence = level_idx.isin(value_idx)
                res.append(existence)

            result = res[0]
            for i in res[1:]:
                result = result & i
        else:
            level_series = self.get_level_values(level)
            result = level_series.isin(values)

        return result
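
A short usage sketch of both code paths (tuples when level is None, plain values when a level is named); the result is a boolean mask as described in the docstring:

import cudf

midx = cudf.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["key", "num"])
midx.isin([("a", 1), ("c", 3)])      # level=None: values must be tuples
midx.isin(["a", "c"], level="key")   # named level: plain values are allowed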
Example #5
File: json.py  Project: TravisHester/cudf
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"
    if engine == "cudf":
        # Multiple sources are passed as a list. If a single source is passed,
        # wrap it in a list for unified processing downstream.
        if not is_list_like(path_or_buf):
            path_or_buf = [path_or_buf]

        filepaths_or_buffers = []
        for source in path_or_buf:
            if ioutils.is_directory(source, **kwargs):
                fs = ioutils._ensure_filesystem(passed_filesystem=None,
                                                path=source)
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            tmp_source, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(tmp_source, list):
                filepaths_or_buffers.extend(tmp_source)
            else:
                filepaths_or_buffers.append(tmp_source)

        return cudf.DataFrame._from_data(*libjson.read_json(
            filepaths_or_buffers, dtype, lines, compression, byte_range))
    else:
        warnings.warn("Using CPU via Pandas to read JSON dataset, this may "
                      "be GPU accelerated in the future")

        if not ioutils.ensure_single_filepath_or_buffer(
                path_or_data=path_or_buf,
                **kwargs,
        ):
            raise NotImplementedError(
                "`read_json` does not yet support reading "
                "multiple files via pandas")

        path_or_buf, compression = ioutils.get_filepath_or_buffer(
            path_or_data=path_or_buf,
            compression=compression,
            iotypes=(BytesIO, StringIO),
            **kwargs,
        )

        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df
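
A minimal sketch of both engines (file paths are hypothetical; the cudf engine requires JSON Lines input, while other engines fall back to pandas):

import cudf

gdf = cudf.read_json("records.jsonl", engine="cudf", lines=True)
pdf_backed = cudf.read_json("table.json", engine="pandas")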
Example #6
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # a list of row groups per source should be passed. make the list of
    # lists that is expected for multiple sources
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs)
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(filepaths_or_buffers,
                             format="parquet",
                             partitioning="hive")

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(row_group_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Intersect eagerly: a lazy ``filter`` object would capture
                # the rebound ``row_groups[i]`` when later consumed.
                row_groups[i] = [
                    rg_id for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs))
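
A hedged usage sketch (hypothetical path; filters uses the same tuple predicates that pyarrow's _filters_to_expression accepts, and row_groups may be passed as a flat list for a single source):

import cudf

gdf = cudf.read_parquet(
    "data.parquet",
    columns=["a", "b"],
    filters=[("a", ">", 0)],
    row_groups=[0, 1],
)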
Example #7
File: string.py  Project: sriramch/cudf
    def cat(self, others=None, sep=None, na_rep=None):
        """
        Concatenate strings in the Series/Index with given separator.

        If *others* is specified, this function concatenates the Series/Index
        and elements of others element-wise. If others is not passed, then all
        values in the Series/Index are concatenated into a single string with
        a given sep.

        Parameters
        ----------
            others : Series or List of str
                Strings to be appended.
                The number of strings must match size() of this instance.
                This must be either a Series of string dtype or a Python
                list of strings.

            sep : str
                If specified, this separator will be appended to each string
                before appending the others.

            na_rep : str
                This character will take the place of any null strings
                (not empty strings) in either list.

                - If `na_rep` is None, and `others` is None, missing values in
                the Series/Index are omitted from the result.
                - If `na_rep` is None, and `others` is not None, a row
                containing a missing value in any of the columns (before
                concatenation) will have a missing value in the result.

        Returns
        -------
        concat : str or Series/Index of str dtype
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of str dtype is returned.
        """
        from cudf.core import Series, Index

        if isinstance(others, Series):
            assert others.dtype == np.dtype("object")
            others = others._column.nvstrings
        elif isinstance(others, Index):
            assert others.dtype == np.dtype("object")
            others = others.as_column().nvstrings
        elif isinstance(others, StringMethods):
            """
            If others is a StringMethods then
            raise an exception
            """
            msg = "series.str is an accessor, not an array-like of strings."
            raise ValueError(msg)
        elif is_list_like(others) and others:
            """
            If others is a list-like object (in our case lists & tuples)
            just another Series/Index, great go ahead with concatenation.
            """

            """
            Picking first element and checking if it really adheres to
            list-like conditions; if not, we switch to the next case.

            Note: We have made a call not to iterate over the entire list as
            it could be more expensive if it was of very large size.
            Thus only doing a sanity check on just the first element of list.
            """
            first = others[0]

            if is_list_like(first) or isinstance(
                first, (Series, Index, pd.Series, pd.Index)
            ):
                """
                Internal elements in others list should also be
                list-like and not a regular string/byte
                """
                first = None
                for frame in others:
                    if not isinstance(frame, Series):
                        """
                        Make sure all inputs to .cat function call
                        are of type nvstrings so creating a Series object.
                        """
                        frame = Series(frame, dtype="str")

                    if first is None:
                        """
                        extracting nvstrings pointer since
                        `frame` is of type Series/Index and
                        first isn't yet initialized.
                        """
                        first = frame._column.nvstrings
                    else:
                        assert frame.dtype == np.dtype("object")
                        frame = frame._column.nvstrings
                        first = first.cat(frame, sep=sep, na_rep=na_rep)

                others = first
            elif not is_list_like(first):
                """
                Picking first element and checking if it really adheres to
                non-list like conditions.

                Note: We have made a call not to iterate over the entire
                list as it could be more expensive if it was of very
                large size. Thus only doing a sanity check on just the
                first element of list.
                """
                others = Series(others)
                others = others._column.nvstrings
        elif isinstance(others, (pd.Series, pd.Index)):
            others = Series(others)
            others = others._column.nvstrings

        data = self._parent.nvstrings.cat(
            others=others, sep=sep, na_rep=na_rep
        )
        out = Series(data, index=self._index, name=self._name)
        if len(out) == 1 and others is None:
            out = out[0]
        return out
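
A short sketch of the accessor in use (results noted in comments follow the docstring's null-handling rules and are approximate):

import cudf

s = cudf.Series(["a", "b", None])
s.str.cat(sep=",")                                       # nulls omitted -> "a,b"
s.str.cat(others=["x", "y", "z"], sep="-", na_rep="?")   # element-wise concatenation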
Example #8
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""

    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a corresponding stripe list. If a single stripe
    # list is provided rather than a list of lists of stripes, apply that
    # stripe list across all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Ensure a stripe list is specified for each input source
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source")

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(passed_filesystem=None,
                                            path=source)
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported")
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(filters, filepaths_or_buffers,
                                           stripes, skiprows, num_rows)

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(*liborc.read_orc(
            filepaths_or_buffers,
            columns,
            stripes,
            skiprows,
            num_rows,
            use_index,
            decimal_cols_as_float,
            timestamp_type,
        ))
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single a "
                "single input source")

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
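
A minimal call sketch (hypothetical path; per the handling above, a single stripe list is wrapped into the per-source list-of-lists form):

import cudf

gdf = cudf.read_orc("data.orc", columns=["a"], stripes=[[0, 1]])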
Example #9
def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
    """
    Bin values into discrete intervals.
    Use cut when you need to segment and sort data values into bins. This
    function is also useful for going from a continuous variable to a
    categorical variable.
    Parameters
    ----------
    x : array-like
        The input array to be binned. Must be 1-dimensional.
    bins : int, sequence of scalars, or IntervalIndex
        The criteria to bin by.
        * int : Defines the number of equal-width bins in the
        range of x. The range of x is extended by .1% on each
        side to include the minimum and maximum values of x.
    right : bool, default True
        Indicates whether bins includes the rightmost edge or not.
    labels : array or False, default None
        Specifies the labels for the returned bins. Must be the same
        length as the resulting bins. If False, returns only integer
        indicators of the bins. If True, raises an error. When
        ordered=False, labels must be provided.
    retbins : bool, default False
        Whether to return the bins or not.
    precision : int, default 3
        The precision at which to store and display the bins labels.
    include_lowest : bool, default False
        Whether the first interval should be left-inclusive or not.
    duplicates : {'raise', 'drop'}, default 'raise'
        If bin edges are not unique, raise ValueError or drop non-uniques.
    ordered : bool, default True
        Whether the labels are ordered or not. Applies to returned types
        Categorical and Series (with Categorical dtype). If True,
        the resulting categorical will be ordered. If False, the resulting
        categorical will be unordered (labels must be provided).
    Returns
    -------
    out : CategoricalIndex
        An array-like object representing the respective bin for each value
        of x. The type depends on the value of labels.
    bins : numpy.ndarray or IntervalIndex
        The computed or specified bins. Only returned when retbins=True.
        For scalar or sequence bins, this is an ndarray with the computed
        bins. If duplicates=drop is set, non-unique bins are dropped. For
        an IntervalIndex bins, this is equal to bins.
    Examples
    --------
    Discretize into three equal-sized bins.
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
    ...         (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0],
    ...         (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category')
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0],
    ...         (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0],
    ...         (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'),
    array([0.994, 3.   , 5.   , 7.   ]))
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'],
    ...       categories=['bad', 'medium', 'good'],ordered=True,
    ...       dtype='category')
    >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...       labels=["B", "A", "B"], ordered=False)
    CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'],
    ...        ordered=False, dtype='category')
    >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3], dtype=int32)
    Passing a Series as an input returns a Series with categorical dtype:
    >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]),
    ...        index=['a', 'b', 'c', 'd', 'e'])
    >>> cudf.cut(s, 3)
    """
    left_inclusive = False
    right_inclusive = True
    # saving the original input x for use in case it is a Series
    orig_x = x
    old_bins = bins

    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: "
            "raise, drop")

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")
        elif ordered and labels is not None:
            if len(set(labels)) != len(labels):
                raise ValueError("labels must be unique if ordered=True;"
                                 "pass ordered=False for duplicate labels")

    # bins can either be an int, a sequence of scalars, or an IntervalIndex
    if isinstance(bins, Sequence):
        if len(set(bins)) != len(bins):
            if duplicates == "raise":
                raise ValueError(
                    f"Bin edges must be unique: {repr(bins)}.\n"
                    f"You can drop duplicate edges by setting the 'duplicates'"
                    "kwarg")
            elif duplicates == "drop":
                # get unique values but maintain list dtype
                bins = list(dict.fromkeys(bins))

    # if bins is an IntervalIndex, we ignore the value of right
    elif isinstance(bins, (pd.IntervalIndex, cudf.IntervalIndex)):
        right = bins.closed == "right"

    # create bins if given an int or single scalar
    if not isinstance(bins, pd.IntervalIndex):
        if not isinstance(bins, (Sequence)):
            if isinstance(x,
                          (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)):
                mn = x.min()
                mx = x.max()
            else:
                mn = min(x)
                mx = max(x)
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

        # if right and include lowest we adjust the first
        # bin edge to make sure it is included
        if right and include_lowest:
            bins[0] = bins[0] - 10**(-precision)

        # if right is false the last bin edge is not included
        if not right:
            right_edge = bins[-1]
            x = cupy.asarray(x)
            x[x == right_edge] = right_edge + 1

        # adjust bin edges decimal precision
        int_label_bins = np.around(bins, precision)

    # the input is a column of the values in the array x
    input_arr = as_column(x)

    # checking for the correct inclusivity values
    if right:
        closed = "right"
    else:
        closed = "left"
        left_inclusive = True

    if isinstance(bins, pd.IntervalIndex):
        interval_labels = bins
    elif labels is None:
        if duplicates == "drop" and len(bins) == 1 and len(old_bins) != 1:
            if right and include_lowest:
                old_bins[0] = old_bins[0] - 10**(-precision)
                interval_labels = interval_range(old_bins[0],
                                                 old_bins[1],
                                                 periods=1,
                                                 closed=closed)
            else:
                interval_labels = IntervalIndex.from_breaks(old_bins,
                                                            closed=closed)
        else:
            # get labels for categories
            interval_labels = IntervalIndex.from_breaks(int_label_bins,
                                                        closed=closed)
    elif labels is not False:
        if not (is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")
        if ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False for"
                "duplicate labels")
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
            if not ordered and len(set(labels)) != len(labels):
                interval_labels = cudf.CategoricalIndex(labels,
                                                        categories=None,
                                                        ordered=False)
            else:
                interval_labels = (labels if len(set(labels)) == len(labels)
                                   else None)

    if isinstance(bins, pd.IntervalIndex):
        # get the left and right edges of the bins as columns
        # we cannot typecast an IntervalIndex, so we need to
        # make the edges the same type as the input array
        left_edges = as_column(bins.left).astype(input_arr.dtype)
        right_edges = as_column(bins.right).astype(input_arr.dtype)
    else:
        # get the left and right edges of the bins as columns
        left_edges = as_column(bins[:-1], dtype="float64")
        right_edges = as_column(bins[1:], dtype="float64")
        # the input arr must be changed to the same type as the edges
        input_arr = input_arr.astype(left_edges.dtype)
    # get the indexes for the appropriate number
    index_labels = cudf._lib.labeling.label_bins(input_arr, left_edges,
                                                 left_inclusive, right_edges,
                                                 right_inclusive)

    if labels is False:
        # if labels is false we return the index labels, we return them
        # as a series if we have a series input
        if isinstance(orig_x, (pd.Series, cudf.Series)):
            # need to run more tests but looks like in this case pandas
            # always returns a float64 dtype
            indx_arr_series = cudf.Series(index_labels, dtype="float64")
            # if retbins we return the bins as well
            if retbins:
                return indx_arr_series, bins
            else:
                return indx_arr_series
        elif retbins:
            return index_labels.values, bins
        else:
            return index_labels.values

    if labels is not None:
        if not ordered and len(set(labels)) != len(labels):
            # when we have duplicate labels and ordered is False, we
            # should allow duplicate categories. The categories are
            # returned in order
            new_data = [interval_labels[i][0] for i in index_labels.values]
            return cudf.CategoricalIndex(new_data,
                                         categories=sorted(set(labels)),
                                         ordered=False)

    col = build_categorical_column(
        categories=interval_labels,
        codes=index_labels,
        mask=index_labels.base_mask,
        offset=index_labels.offset,
        size=index_labels.size,
        ordered=ordered,
    )

    # we return a categorical index, as we don't have a Categorical method
    categorical_index = cudf.core.index.as_index(col)

    if isinstance(orig_x, (pd.Series, cudf.Series)):
        # if we have a series input we return a series output
        res_series = cudf.Series(categorical_index, index=orig_x.index)
        if retbins:
            return res_series, bins
        else:
            return res_series
    elif retbins:
        # if retbins is true we return the bins as well
        return categorical_index, bins
    else:
        return categorical_index
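
Beyond the docstring examples, a small sketch of the Series path with retbins=True (return values as described in the docstring; exact dtypes may vary):

import cudf
import numpy as np

s = cudf.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"])
binned, edges = cudf.cut(s, bins=3, retbins=True)
# binned is a Series with categorical dtype; edges holds the computed bin edges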