Exemplo n.º 1
0
    def test_merge(self):
        """Merging two blocks yields one block ordered by manager location.

        The first block is placed at the positions of 'e' and 'b' in the
        reference columns (0 and 2), the second at those of 'a' and 'd'
        (1 and 3), so the merged values interleave the two inputs.
        """
        columns = Index(['e', 'a', 'b', 'd', 'f'])

        left_vals = randn(2, 10)
        right_vals = randn(2, 10)

        left = make_block(left_vals, columns.get_indexer(['e', 'b']))
        right = make_block(right_vals, columns.get_indexer(['a', 'd']))

        result = left.merge(right)

        assert_almost_equal(result.mgr_locs, [0, 1, 2, 3])
        # rows 0/2 must come from the first block, rows 1/3 from the second
        for rows, expected in (([0, 2], left_vals), ([1, 3], right_vals)):
            assert_almost_equal(result.values[rows], expected)
Exemplo n.º 2
0
def test_get_indexer():
    """Exercise ``MultiIndex.get_indexer`` for exact and fill-based lookups.

    Covers exact matching, the 'pad'/'ffill' and 'backfill'/'bfill' method
    aliases (on both the forward and the reversed target), non-MultiIndex
    targets, and the error raised when the index holds duplicates.
    """
    levels = [Index(lrange(4)), Index(lrange(2))]
    labels = [
        np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp),
        np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp),
    ]
    full = MultiIndex(levels=levels, labels=labels)

    head = full[:5]
    picked = full[[1, 3, 5]]

    # exact matching: the last entry of `picked` is not part of `head`
    exact = head.get_indexer(picked)
    assert_almost_equal(exact, np.array([1, 3, -1], dtype=np.intp))

    # (method, alias of that method, expected positions)
    fill_cases = (
        ('pad', 'ffill', [-1, 0, 0, 1, 1]),
        ('backfill', 'bfill', [0, 0, 1, 1, 2]),
    )
    for method, alias, positions in fill_cases:
        forward = picked.get_indexer(head, method=method)
        expected = np.array(positions, dtype=np.intp)
        assert_almost_equal(forward, expected)

        backward = picked.get_indexer(head[::-1], method=method)
        assert_almost_equal(backward, expected[::-1])

        # the alias must give the same answer as the canonical name
        aliased = picked.get_indexer(head, method=alias)
        assert_almost_equal(forward, aliased)

    # pass non-MultiIndex
    from_values = head.get_indexer(picked.values)
    from_index = head.get_indexer(picked)
    assert_almost_equal(from_values, from_index)

    unmatched = head.get_indexer([1, 2, 3])
    assert (unmatched == [-1, -1, -1]).all()

    # create index with duplicates
    duplicated = Index(lrange(10) + lrange(10))
    target = Index(lrange(20))

    msg = "Reindexing only valid with uniquely valued Index objects"
    with tm.assert_raises_regex(InvalidIndexError, msg):
        duplicated.get_indexer(target)
Exemplo n.º 3
0
    def test_merge(self):
        """Check that ``Block.merge`` interleaves two blocks by location.

        Columns 'e'/'b' sit at positions 0/2 of the reference index and
        'a'/'d' at positions 1/3, so the merged block must cover locations
        0..3 with the rows of both inputs alternating.
        """
        first_vals, second_vals = randn(2, 10), randn(2, 10)

        locate = Index(["e", "a", "b", "d", "f"]).get_indexer

        merged = make_block(first_vals, locate(["e", "b"])).merge(
            make_block(second_vals, locate(["a", "d"]))
        )

        assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3])

        even_rows = merged.values[[0, 2]]
        odd_rows = merged.values[[1, 3]]
        assert_almost_equal(even_rows, first_vals)
        assert_almost_equal(odd_rows, second_vals)
Exemplo n.º 4
0
def union_categoricals(to_union):
    """
    Concatenate unordered Categoricals into one, unioning their categories.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals
        Inputs to combine. None of them may be ordered and the
        ``categories`` of all of them must share one dtype.

    Returns
    -------
    Categorical
        Unordered result holding the values of every input in sequence.
        Its categories are the distinct categories of the inputs, in
        order of first appearance.

    Raises
    ------
    TypeError
        If any input is ordered, or if the categories dtypes differ.
    ValueError
        If ``to_union`` is empty.
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    first = to_union[0]
    for cat in to_union:
        if cat.ordered:
            raise TypeError("Can only combine unordered Categoricals")

    if any(not is_dtype_equal(cat.categories.dtype, first.categories.dtype)
           for cat in to_union):
        raise TypeError("dtype of categories must be the same")

    combined = first.categories.append([cat.categories for cat in to_union[1:]])
    categories = Index(combined.unique())

    def _recode(cat):
        # Translate the codes of `cat` into positions within `categories`.
        if len(cat.categories) == 0:
            # must be all NaN
            return cat.codes
        mapping = categories.get_indexer(cat.categories)
        return take_1d(mapping, cat.codes, fill_value=-1)

    new_codes = np.concatenate([_recode(cat) for cat in to_union])
    return Categorical(new_codes, categories=categories, ordered=False, fastpath=True)
Exemplo n.º 5
0
    def test_merge(self):
        """Merged block must expose sorted locations and interleaved values.

        Each input block is built from random values and the positions of
        its column labels inside the reference index; after merging, rows
        0/2 belong to the first block and rows 1/3 to the second.
        """
        ref_cols = Index(['e', 'a', 'b', 'd', 'f'])

        # (column labels, values) for each block, in merge order
        payload = [
            (['e', 'b'], randn(2, 10)),
            (['a', 'd'], randn(2, 10)),
        ]
        blocks = [make_block(vals, ref_cols.get_indexer(names))
                  for names, vals in payload]

        merged = blocks[0].merge(blocks[1])

        expected_locs = np.array([0, 1, 2, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(merged.mgr_locs.as_array, expected_locs)

        for rows, (_, vals) in zip(([0, 2], [1, 3]), payload):
            tm.assert_numpy_array_equal(merged.values[rows], np.array(vals))
Exemplo n.º 6
0
    def test_get_indexer_strings_raises(self):
        """Distance-based lookups on a string Index must raise ``TypeError``.

        'nearest' matching and any use of ``tolerance`` are expected to fail
        with the string-subtraction message, whether the tolerance is a
        scalar or a list.
        """
        index = Index(["b", "c"])
        target = ["a", "b", "c", "d"]

        msg = r"unsupported operand type\(s\) for -: 'str' and 'str'"

        failing_kwargs = (
            {"method": "nearest"},
            {"method": "pad", "tolerance": 2},
            {"method": "pad", "tolerance": [2, 2, 2, 2]},
        )
        for kwargs in failing_kwargs:
            with pytest.raises(TypeError, match=msg):
                index.get_indexer(target, **kwargs)
Exemplo n.º 7
0
def _gen_query_anndata(
    data: Union[MultimodalData, UnimodalData],
    ref_features: pd.Index,
    obs_columns: Optional[List[str]] = None,
    matkey: str = "counts",
) -> anndata.AnnData:
    """ Build a query AnnData whose columns follow the reference features

    Parameters
    ----------
    data: ``pegasusio.MultimodalData`` or ``pegasusio.UnimodalData``
        Annotated data matrix with rows for cells and columns for genes.
    ref_features: ``pd.Index``
        A pandas index of reference feature names; it becomes the ``var``
        index of the result and fixes the number of columns of ``X``.
    obs_columns: ``List[str]``, optional, default: ``None``
        obs keys to carry over. When ``None`` or empty, the whole ``obs``
        table of ``data`` is used.
    matkey: ``str``, optional, default: ``"counts"``
        Matrix key for the raw count

    Returns
    -------
    An AnnData object with a CSR matrix ``X`` of shape
    ``(n_cells, ref_features.size)``.
    """
    counts = data.get_matrix(matkey)  # NOTE(review): accessed as CSR (.indices/.indptr) -- confirm
    obs_field = data.obs[obs_columns] if obs_columns else data.obs
    var_field = pd.DataFrame(index=ref_features)

    # position of every feature of `data` inside the reference (-1 if absent)
    col_map = ref_features.get_indexer(data.var_names)
    # number of stored entries whose column exists in the reference
    n_kept = (col_map[counts.indices] >= 0).sum()

    # presumably drops the unmatched entries and renumbers the columns;
    # verify against _select_csr
    csr_parts = _select_csr(counts.data, counts.indices, counts.indptr,
                            col_map, n_kept)
    data_new, indices_new, indptr_new = csr_parts

    out_shape = (counts.shape[0], ref_features.size)
    X = csr_matrix((data_new, indices_new, indptr_new), shape=out_shape)
    X.sort_indices()

    return anndata.AnnData(X=X, obs=obs_field, var=var_field)
Exemplo n.º 8
0
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Concatenate Categorical-like objects while unioning their categories.

    The categories of every input must share a single dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        When true the categories of the result are sorted; otherwise they
        keep the order in which they appear in the inputs.
    ignore_order : boolean, default False
        When true the ``ordered`` attribute of the inputs is disregarded
        and the result is always unordered.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - an input is not a Categorical, CategoricalIndex or Series
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce the accepted containers to a plain Categorical.
        if isinstance(x, Categorical):
            return x
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        raise TypeError("all components to combine must be Categorical")

    arrays = [_maybe_unwrap(x) for x in to_union]
    first, others = arrays[0], arrays[1:]

    if any(not is_dtype_equal(other.categories.dtype, first.categories.dtype)
           for other in others):
        raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in others):
        # identical categories - fastpath
        ordered = first.ordered
        categories = first.categories
        new_codes = np.concatenate([arr.codes for arr in arrays])

        if sort_categories:
            if ordered and not ignore_order:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                indexer = categories.get_indexer(first.categories)
                new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or not any(arr.ordered for arr in arrays):
        # different categories - union and recode
        ordered = False
        combined = first.categories.append([arr.categories for arr in others])
        categories = Index(combined.unique())
        if sort_categories:
            categories = categories.sort_values()

        def _recode(arr):
            if len(arr.categories) == 0:
                # must be all NaN
                return arr.codes
            indexer = categories.get_indexer(arr.categories)
            return take_1d(indexer, arr.codes, fill_value=-1)

        new_codes = np.concatenate([_recode(arr) for arr in arrays])
    elif all(arr.ordered for arr in arrays):
        # ordered - to show a proper error message
        raise TypeError("to union ordered Categoricals, "
                        "all categories must be the same")
    else:
        raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes, categories=categories,
                       ordered=False if ignore_order else ordered,
                       fastpath=True)
Exemplo n.º 9
0
    def test_get_indexer_nearest(self, method, tolerance, indexer, expected):
        """Compare ``get_indexer`` on ``Index(0..9)`` with the expected positions.

        ``method``, ``tolerance``, ``indexer`` and ``expected`` are supplied
        by the caller; the result must equal ``expected`` as an ``intp`` array.
        """
        result = Index(np.arange(10)).get_indexer(
            indexer, method=method, tolerance=tolerance
        )
        wanted = np.array(expected, dtype=np.intp)
        tm.assert_numpy_array_equal(result, wanted)
Exemplo n.º 10
0
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Concatenate Categorical-like objects while unioning their categories.

    The categories of every input must share a single dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        When true the categories of the result are sorted; otherwise they
        keep the order in which they appear in the inputs.
    ignore_order : boolean, default False
        When true the ``ordered`` attribute of the inputs is disregarded
        and the result is always unordered.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - an input is not a Categorical, CategoricalIndex or Series
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    Categoricals whose categories differ are combined into one whose
    categories are the union of all input categories.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    The categories of the result follow the order of the input
    `categories` unless `sort_categories=True` is passed.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    Inputs sharing the same categories and order information are simply
    appended to each other, and the ordering is kept.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Ordered inputs whose categories are not identical raise `TypeError`.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by passing `ignore_order=True`.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    A `CategoricalIndex` or a `Series` holding categorical data is accepted
    as well, but the result is always a plain `Categorical`.

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce the accepted containers to a plain Categorical.
        if isinstance(x, Categorical):
            return x
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        raise TypeError("all components to combine must be Categorical")

    def _remap(indexer, codes):
        # Imported only when recoding is actually required.
        from pandas.core.algorithms import take_1d
        return take_1d(indexer, codes, fill_value=-1)

    arrays = [_maybe_unwrap(x) for x in to_union]
    first, others = arrays[0], arrays[1:]

    for other in others:
        if not is_dtype_equal(other.categories.dtype, first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in others):
        # identical categories - fastpath
        ordered = first.ordered
        categories = first.categories
        new_codes = np.concatenate([arr.codes for arr in arrays])

        if sort_categories:
            if ordered and not ignore_order:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                new_codes = _remap(categories.get_indexer(first.categories),
                                   new_codes)
    elif ignore_order or not any(arr.ordered for arr in arrays):
        # different categories - union and recode
        ordered = False
        combined = first.categories.append([arr.categories for arr in others])
        categories = Index(combined.unique())
        if sort_categories:
            categories = categories.sort_values()

        pieces = []
        for arr in arrays:
            if len(arr.categories) == 0:
                # must be all NaN
                pieces.append(arr.codes)
                continue
            pieces.append(_remap(categories.get_indexer(arr.categories),
                                 arr.codes))
        new_codes = np.concatenate(pieces)
    elif all(arr.ordered for arr in arrays):
        # ordered - to show a proper error message
        raise TypeError("to union ordered Categoricals, "
                        "all categories must be the same")
    else:
        raise TypeError('Categorical.ordered must be the same')

    return Categorical(new_codes,
                       categories=categories,
                       ordered=False if ignore_order else ordered,
                       fastpath=True)
Exemplo n.º 11
0
def union_categoricals(to_union):
    """
    Concatenate Categoricals into one, unioning their categories.

    The categories of every input must share a single dtype. Inputs for
    which ``first.is_dtype_equal`` holds are concatenated directly and
    keep the ``ordered`` flag of the first input; otherwise every input
    must be unordered.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array; when the categories had to be unioned they are
       ordered as they appear in the list and the result is unordered

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    for cat in to_union:
        if not is_dtype_equal(cat.categories.dtype, first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in to_union[1:]):
        # identical dtype: the codes can be concatenated untouched
        codes = np.concatenate([cat.codes for cat in to_union])
        return Categorical(codes,
                           categories=first.categories,
                           ordered=first.ordered,
                           fastpath=True)

    if any(cat.ordered for cat in to_union):
        # to show a proper error message
        if all(cat.ordered for cat in to_union):
            raise TypeError("to union ordered Categoricals, "
                            "all categories must be the same")
        raise TypeError('Categorical.ordered must be the same')

    # not ordered: union the categories and recode every input
    combined = first.categories.append([cat.categories for cat in to_union[1:]])
    categories = Index(combined.unique())

    def _recode(cat):
        if len(cat.categories) == 0:
            # must be all NaN
            return cat.codes
        indexer = categories.get_indexer(cat.categories)
        return take_1d(indexer, cat.codes, fill_value=-1)

    new_codes = np.concatenate([_recode(cat) for cat in to_union])
    return Categorical(new_codes,
                       categories=categories,
                       ordered=False,
                       fastpath=True)
Exemplo n.º 12
0
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories.

    All categories must have the same dtype. Ordered inputs are accepted
    only on the fast path, i.e. when ``first.is_dtype_equal`` is true for
    every other input; in that case the ``ordered`` flag of the first
    input is carried over to the result.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
       A single array. Off the fast path the result is unordered and its
       categories are ordered as they appear in the list

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    same_categories_dtype = all(
        is_dtype_equal(c.categories.dtype, first.categories.dtype)
        for c in to_union)
    if not same_categories_dtype:
        raise TypeError("dtype of categories must be the same")

    identical = all(first.is_dtype_equal(other) for other in to_union[1:])
    if identical:
        return Categorical(np.concatenate([c.codes for c in to_union]),
                           categories=first.categories, ordered=first.ordered,
                           fastpath=True)

    none_ordered = all(not c.ordered for c in to_union)
    if not none_ordered:
        # to show a proper error message
        if all(c.ordered for c in to_union):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
        else:
            msg = 'Categorical.ordered must be the same'
        raise TypeError(msg)

    pooled = first.categories.append([c.categories for c in to_union[1:]])
    categories = Index(pooled.unique())

    recoded = []
    for c in to_union:
        if len(c.categories) == 0:
            # must be all NaN
            recoded.append(c.codes)
            continue
        positions = categories.get_indexer(c.categories)
        recoded.append(take_1d(positions, c.codes, fill_value=-1))

    return Categorical(np.concatenate(recoded), categories=categories,
                       ordered=False, fastpath=True)
Exemplo n.º 13
0
    def test_get_indexer_strings(self, method, expected):
        """Look up four string labels in ``Index(["b", "c"])`` using ``method``.

        The positions returned by ``get_indexer`` must equal ``expected``.
        """
        labels = ["a", "b", "c", "d"]
        result = Index(["b", "c"]).get_indexer(labels, method=method)

        tm.assert_numpy_array_equal(result, expected)
Exemplo n.º 14
0
def _fill_value_fits(fill_value, dtype) -> bool:
    """
    Return True when ``fill_value`` can be stored in ``dtype`` without loss.

    This is a value-based check: the *value* decides, not the default dtype
    NumPy would give it. ``np.can_cast`` performed this check for Python
    scalars on NumPy 1.x but raises ``TypeError`` for them on NumPy >= 2,
    so the check is rebuilt here from ``np.min_scalar_type``.
    """
    smallest = np.min_scalar_type(fill_value)
    if np.can_cast(smallest, dtype):
        return True
    # min_scalar_type reports non-negative integers as unsigned; such a value
    # still fits a signed integer dtype as long as it is within its range.
    target = np.dtype(dtype)
    return bool(
        smallest.kind == "u"
        and target.kind == "i"
        and fill_value <= np.iinfo(target).max
    )


def gen_reindexer(new_var: pd.Index, cur_var: pd.Index, *, fill_value=0):
    """
    Given a new set of var_names, and a current set, generates a function which will reindex
    a matrix to be aligned with the new set.

    Parameters
    ----------
    new_var
        Target column labels; the reindexed matrix has ``len(new_var)`` columns.
    cur_var
        Column labels of the matrices that will be passed to the returned function.
        Labels absent from ``new_var`` are dropped.
    fill_value
        Default value written into the columns of ``new_var`` that are missing
        from ``cur_var``. Only applied when it is not equal to 0.

    Returns
    -------
    ``reindexer(X, fill_value=fill_value)``, which returns ``X`` with its columns
    rearranged to follow ``new_var``. If ``fill_value`` cannot be represented in
    ``X.dtype``, the result uses the promoted dtype of the two.

    Usage
    -----

    >>> a = AnnData(sparse.eye(3), var=pd.DataFrame(index=list("abc")))
    >>> b = AnnData(sparse.eye(2), var=pd.DataFrame(index=list("ba")))
    >>> reindexer = gen_reindexer(a.var_names, b.var_names)
    >>> sparse.vstack([a.X, reindexer(b.X)]).toarray()
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.],
           [0., 1., 0.],
           [1., 0., 0.]], dtype=float32)
    >>> reindexer_nan = gen_reindexer(a.var_names, b.var_names, fill_value=np.nan)
    >>> sparse.vstack([a.X, reindexer_nan(b.X)]).toarray()
    array([[ 1.,  0.,  0.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1., nan],
           [ 1.,  0., nan]], dtype=float32)
    """
    new_size = len(new_var)
    old_size = len(cur_var)
    # position of each current column inside new_var (-1 when absent)
    new_pts = new_var.get_indexer(cur_var)
    cur_pts = np.arange(len(new_pts))

    # keep only the columns that exist in the new set
    mask = new_pts != -1

    new_pts = new_pts[mask]
    cur_pts = cur_pts[mask]

    def reindexer(X, fill_value=fill_value):
        # np.can_cast(fill_value, X.dtype) would raise TypeError for Python
        # scalars on NumPy >= 2, hence the explicit value-based check.
        if _fill_value_fits(fill_value, X.dtype):
            out_dtype = X.dtype
        else:
            out_dtype = np.promote_types(np.array(fill_value).dtype, X.dtype)

        # (old_size x new_size) selection matrix: a 1 at (i, j) routes
        # current column i to new column j.
        idxmtx = sparse.coo_matrix(
            (np.ones(len(new_pts), dtype=int), (cur_pts, new_pts)),
            shape=(old_size, new_size),
            dtype=out_dtype,
        )
        out = X @ idxmtx

        if fill_value != 0:
            # columns of the new set that have no counterpart in the current one
            to_fill = new_var.get_indexer(new_var.difference(cur_var))
            if len(to_fill) > 0:
                # More efficient to set columns on csc
                if sparse.issparse(out):
                    out = sparse.csc_matrix(out)
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                    out[:, to_fill] = fill_value

        return out

    return reindexer
Exemplo n.º 15
0
def union_categoricals(to_union, sort_categories=False):
    """
    Concatenate Categorical-like objects while unioning their categories.

    The categories of every input must share a single dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        When true the categories of the result are sorted; otherwise they
        keep the order in which they appear in the inputs.

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - an input is not a Categorical, CategoricalIndex or Series
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Reduce the accepted containers to a plain Categorical.
        if isinstance(x, Categorical):
            return x
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        raise TypeError("all components to combine must be Categorical")

    arrays = [_maybe_unwrap(x) for x in to_union]
    first = arrays[0]
    others = arrays[1:]

    for other in others:
        if not is_dtype_equal(other.categories.dtype, first.categories.dtype):
            raise TypeError("dtype of categories must be the same")

    if all(first.is_dtype_equal(other) for other in others):
        # identical categories - fastpath
        ordered = first.ordered
        categories = first.categories
        new_codes = np.concatenate([arr.codes for arr in arrays])

        if sort_categories:
            if ordered:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                indexer = categories.get_indexer(first.categories)
                new_codes = take_1d(indexer, new_codes, fill_value=-1)

        return Categorical(new_codes,
                           categories=categories,
                           ordered=ordered,
                           fastpath=True)

    if any(arr.ordered for arr in arrays):
        # ordered - to show a proper error message
        if all(arr.ordered for arr in arrays):
            raise TypeError("to union ordered Categoricals, "
                            "all categories must be the same")
        raise TypeError('Categorical.ordered must be the same')

    # different categories - union and recode
    combined = first.categories.append([arr.categories for arr in others])
    categories = Index(combined.unique())
    if sort_categories:
        categories = categories.sort_values()

    def _recode(arr):
        if len(arr.categories) == 0:
            # must be all NaN
            return arr.codes
        indexer = categories.get_indexer(arr.categories)
        return take_1d(indexer, arr.codes, fill_value=-1)

    new_codes = np.concatenate([_recode(arr) for arr in arrays])
    return Categorical(new_codes,
                       categories=categories,
                       ordered=False,
                       fastpath=True)
Exemplo n.º 16
0
 def _process_pd_index(base_idx: pd.Index, index_1d: pd.Index) -> List[int]:
     if index_1d.has_duplicates:
         index_1d = index_1d.drop_duplicates()
     indexer = base_idx.get_indexer(index_1d)
     indexer = indexer[indexer >= 0]
     return indexer
Exemplo n.º 17
0
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    .. versionadded:: 0.19.0

    Parameters
    ----------
    to_union : list-like of Categorical, CategoricalIndex,
               or Series with dtype='category'
    sort_categories : boolean, default False
        If true, the categories of the result are sorted, otherwise
        they are ordered as they appear in the data.
    ignore_order : boolean, default False
        If true, the ``ordered`` attribute of the inputs is ignored and
        the result is an unordered categorical.

        .. versionadded:: 0.20.0

    Returns
    -------
    result : Categorical

    Raises
    ------
    TypeError
        - an input is not a Categorical, CategoricalIndex or Series
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----

    To learn more about categories, see `link
    <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__

    Examples
    --------

    >>> from pandas.api.types import union_categoricals

    Inputs that do not share the same categories are combined into a
    categorical whose categories are the union of those of the inputs.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]

    By default the resulting categories are ordered as they appear in the
    `categories` of the data. Pass `sort_categories=True` to sort them.

    >>> union_categoricals([a, b], sort_categories=True)
    [b, c, a, b]
    Categories (3, object): [a, b, c]

    Two categoricals with the same categories and order information are
    combined as if one had been appended to the other.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    [a, b, a, b, a]
    Categories (2, object): [a < b]

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    [a, b, c, c, b, a]
    Categories (3, object): [a, b, c]

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    [b, c, a, b]
    Categories (3, object): [b, c, a]
    """
    from pandas import Index, Categorical, CategoricalIndex, Series

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    def _maybe_unwrap(x):
        # Series and CategoricalIndex carry their Categorical in `.values`.
        if isinstance(x, (CategoricalIndex, Series)):
            return x.values
        if isinstance(x, Categorical):
            return x
        raise TypeError("all components to combine must be Categorical")

    def _take_codes(indexer, codes):
        # Deferred import: only needed when codes have to be remapped.
        from pandas.core.algorithms import take_1d
        return take_1d(indexer, codes, fill_value=-1)

    unwrapped = [_maybe_unwrap(x) for x in to_union]
    first = unwrapped[0]
    rest = unwrapped[1:]

    dtypes_agree = all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in rest)
    if not dtypes_agree:
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first.is_dtype_equal(other) for other in rest):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered
        new_codes = np.concatenate([c.codes for c in unwrapped])

        if sort_categories:
            if ordered and not ignore_order:
                raise TypeError("Cannot use sort_categories=True with "
                                "ordered Categoricals")
            if not categories.is_monotonic_increasing:
                categories = categories.sort_values()
                new_codes = _take_codes(
                    categories.get_indexer(first.categories), new_codes)
    elif ignore_order or not any(c.ordered for c in unwrapped):
        # different categories - union and recode
        pooled = first.categories.append([c.categories for c in rest])
        categories = Index(pooled.unique())
        if sort_categories:
            categories = categories.sort_values()

        def _recode(c):
            if len(c.categories) == 0:
                # must be all NaN
                return c.codes
            return _take_codes(categories.get_indexer(c.categories), c.codes)

        new_codes = np.concatenate([_recode(c) for c in unwrapped])
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in unwrapped):
            msg = ("to union ordered Categoricals, "
                   "all categories must be the same")
        else:
            msg = 'Categorical.ordered must be the same'
        raise TypeError(msg)

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered,
                       fastpath=True)