Пример #1
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : unused (kept for backward compatibility)
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    labels : ndarray (platform int)
        Code of each value within ``uniques`` (``na_sentinel`` for NA)
    uniques : ndarray
        The distinct values, in order of first appearance (or sorted
        when ``sort=True``)
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # catch only the comparison failure — mixed str/int is
            # unorderable in py3.  (The previous bare ``except:`` also
            # swallowed unrelated errors such as KeyboardInterrupt.)
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for e in uniques if f(e)], dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        # reverse_indexer[old_code] -> code under the sorted ordering
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # restore NA sentinels clobbered by the take above
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
Пример #2
0
def values_at_time(obj, time, tz=None, asof=False):
    """
    Select values at particular time of day (e.g. 9:30AM)

    Parameters
    ----------
    time : datetime.time or string
    tz : string or pytz.timezone
        Time zone for time. Corresponding timestamps would be converted to
        time zone of the TimeSeries

    Returns
    -------
    values_at_time : TimeSeries
    """
    from dateutil.parser import parse

    # asof/tz handling and non-DatetimeIndex objects are not supported yet
    if asof:
        raise NotImplementedError
    if tz:
        raise NotImplementedError
    if not isinstance(obj.index, DatetimeIndex):
        raise NotImplementedError

    # accept a "9:30AM"-style string in place of a datetime.time
    if isinstance(time, basestring):
        time = parse(time).time()

    # TODO: time object with tzinfo?

    nanos = _time_to_nanosecond(time)
    locs = lib.values_at_time(obj.index.asi8, nanos)
    return obj.take(com._ensure_platform_int(locs))
Пример #3
0
    def remove_unused_categories(self, inplace=False):
        """ Removes categories which are not used.

        Parameters
        ----------
        inplace : boolean (default: False)
           Whether or not to drop unused categories inplace or return a copy of this categorical
           with unused categories dropped.

        Returns
        -------
        cat : Categorical with unused categories dropped or None if inplace.

        See also
        --------
        rename_categories
        reorder_categories
        add_categories
        remove_categories
        set_categories
        """
        if inplace:
            cat = self
        else:
            cat = self.copy()
        # codes that actually occur in the data
        used_codes = sorted(np.unique(cat._codes))
        kept = _ensure_index(
            cat.categories.take(com._ensure_platform_int(used_codes)))
        # recompute codes against the reduced category set
        cat._codes = _get_codes_for_values(cat.__array__(), kept)
        cat._categories = kept
        if not inplace:
            return cat
Пример #4
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort : boolean, default False
        Sort the uniques (labels are remapped accordingly)
    order : unused
    na_sentinel : int, default -1
        Label used to mark "not found" / NA values

    Returns
    -------
    labels : ndarray (platform int)
    uniques : ndarray
    counts : ndarray
        Occurrence count of each unique value
    """
    hash_klass, values = _get_data_algo(values, _hashtables)

    uniques = []
    table = hash_klass(len(values))
    # the hash table fills `uniques` in place and returns (labels, counts)
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old_label] -> label under the sorted ordering
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # restore the NA sentinel clobbered by the take above
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    return labels, uniques, counts
Пример #5
0
    def _make_selectors(self):
        # Precompute the masks/indexers that locate each row of the input
        # in the fully-expanded (unstacked) output grid.
        new_levels = self.new_index_levels

        # make the mask
        # flat group id from all index labels except the level being unstacked
        group_index = get_group_index(self.sorted_labels[:-1], [len(x) for x in new_levels])

        group_index = _ensure_platform_int(group_index)

        # mark which group ids are actually observed in the data
        group_mask = np.zeros(self.full_shape[0], dtype=bool)
        group_mask.put(group_index, True)

        stride = self.index.levshape[self.level]
        # flat position of each row in the (groups, stride) output grid
        selector = self.sorted_labels[-1] + stride * group_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        # compress labels
        unique_groups = np.arange(self.full_shape[0])[group_mask]
        compressor = group_index.searchsorted(unique_groups)

        # two rows mapping to the same output cell => duplicate index entries
        if mask.sum() < len(self.index):
            raise ReshapeError("Index contains duplicate entries, " "cannot reshape")

        self.group_mask = group_mask
        self.group_index = group_index
        self.mask = mask
        self.unique_groups = unique_groups
        self.compressor = compressor
Пример #6
0
    def _make_selectors(self):
        # Precompute the masks/indexers that locate each row of the input
        # in the fully-expanded (unstacked) output grid.
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        # compress the remaining labels into dense observed-group ids
        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level]
        self.full_shape = ngroups, stride

        # flat position of each row in the (ngroups, stride) output grid
        selector = self.sorted_labels[-1] + stride * comp_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        # two rows mapping to the same output cell => duplicate index entries
        if mask.sum() < len(self.index):
            raise ReshapeError('Index contains duplicate entries, '
                               'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
Пример #7
0
 def take(self, indices, axis=0, allow_fill=True, fill_value=None):
     """
     Analogous to ndarray.take; selects by position on the underlying
     int64 ordinals and rewraps with the original name/freq.
     """
     locs = com._ensure_platform_int(indices)
     new_values = self.asi8.take(locs, axis=axis)
     return self._simple_new(new_values, self.name, freq=self.freq)
Пример #8
0
 def take(self, indices, axis=0, allow_fill=True, fill_value=None):
     """Positional selection on the codes, mirroring ndarray.take.

     Missing positions are represented by the -1 sentinel when
     allow_fill is enabled.
     """
     locs = com._ensure_platform_int(indices)
     new_codes = self._assert_take_fillable(
         self.codes, locs,
         allow_fill=allow_fill,
         fill_value=fill_value,
         na_value=-1)
     return self._create_from_codes(new_codes)
Пример #9
0
def factor_indexer(shape, labels):
    """ given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer
    """
    # row-major strides: shape (a, b, c) -> multipliers (b*c, c, 1)
    strides = np.append(np.array(shape)[::-1].cumprod()[::-1], [1])
    flat = np.sum(np.array(labels).T * strides, axis=1).T
    return com._ensure_platform_int(flat)
Пример #10
0
 def take(self, indices, axis=0, allow_fill=True, fill_value=None):
     """Positional selection on the codes, mirroring ndarray.take.

     Missing positions are represented by the -1 sentinel when
     allow_fill is enabled.
     """
     locs = com._ensure_platform_int(indices)
     new_codes = self._assert_take_fillable(
         self.codes, locs,
         allow_fill=allow_fill,
         fill_value=fill_value,
         na_value=-1)
     return self._create_from_codes(new_codes)
Пример #11
0
 def ref_locs(self):
     """
     Locations of this block's items within ref_items, computed lazily
     and cached on first access.
     """
     if self._ref_locs is None:
         indexer = self.ref_items.get_indexer(self.items)
         indexer = com._ensure_platform_int(indexer)
         # raise explicitly instead of using a bare assert, which is
         # silently stripped when Python runs with -O (and matches the
         # sibling implementations of this property)
         if (indexer == -1).any():
             raise AssertionError('Some block items were not in block '
                                  'ref_items')
         self._ref_locs = indexer
     return self._ref_locs
Пример #12
0
 def take(self, indices, axis=0, allow_fill=True, fill_value=None):
     """
     Analogous to ndarray.take; selects by position on the underlying
     int64 ordinals and rewraps with the original name/freq.
     """
     locs = com._ensure_platform_int(indices)
     new_values = self.asi8.take(locs, axis=axis)
     return self._simple_new(new_values, self.name, freq=self.freq)
Пример #13
0
 def ref_locs(self):
     """
     Locations of this block's items within ref_items, computed lazily
     and cached on first access.
     """
     if self._ref_locs is None:
         indexer = self.ref_items.get_indexer(self.items)
         indexer = com._ensure_platform_int(indexer)
         # raise explicitly instead of using a bare assert, which is
         # silently stripped when Python runs with -O (and matches the
         # sibling implementations of this property)
         if (indexer == -1).any():
             raise AssertionError('Some block items were not in block '
                                  'ref_items')
         self._ref_locs = indexer
     return self._ref_locs
Пример #14
0
    def remove_unused_categories(self, inplace=False):
        """ Removes categories which are not used.

        Parameters
        ----------
        inplace : boolean (default: False)
           Whether or not to drop unused categories inplace or return a copy of this categorical
           with unused categories dropped.

        Returns
        -------
        cat : Categorical with unused categories dropped or None if inplace.

        See also
        --------
        rename_categories
        reorder_categories
        add_categories
        remove_categories
        set_categories
        """
        if inplace:
            cat = self
        else:
            cat = self.copy()
        # codes that actually occur in the data
        used_codes = sorted(np.unique(cat._codes))
        kept = _ensure_index(
            cat.categories.take(com._ensure_platform_int(used_codes)))
        # recompute codes against the reduced category set
        cat._codes = _get_codes_for_values(cat.__array__(), kept)
        cat._categories = kept
        if not inplace:
            return cat
Пример #15
0
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None):
        """
        Conform this SparseDataFrame's rows to `index`; rows absent from
        the current index are filled with `fill_value`.
        """
        if level is not None:
            # raise a concrete exception type instead of bare Exception;
            # matches the TypeError used by the newer implementation and
            # stays catchable by callers handling Exception
            raise TypeError('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        # nothing to align against: everything comes out missing
        if len(self.index) == 0:
            return SparseDataFrame(index=index, columns=self.columns)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = com._ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            values = series.values
            new = values.take(indexer)

            # overwrite positions that had no match in the old index
            if need_mask:
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return SparseDataFrame(new_series, index=index, columns=self.columns,
                               default_fill_value=self.default_fill_value)
Пример #16
0
    def take(self, indices, axis=0, convert=True):
        """
        Analogous to ndarray.take, return SparseDataFrame corresponding to
        requested indices along an axis

        Parameters
        ----------
        indices : list / array of ints
        axis : {0, 1}
        convert : convert indices for negative values, check bounds, default True
                  mainly useful for an user routine calling

        Returns
        -------
        taken : SparseDataFrame
        """

        indices = com._ensure_platform_int(indices)

        # normalize negative indices / bounds-check against the axis length
        if convert:
            indices = _maybe_convert_indices(indices, len(self._get_axis(axis)))

        taken_values = self.values.take(indices, axis=axis)
        if axis == 0:
            new_index = self.index.take(indices)
            new_columns = self.columns
        else:
            new_index = self.index
            new_columns = self.columns.take(indices)
        return self._constructor(taken_values, index=new_index,
                                 columns=new_columns)
Пример #17
0
 def take(self, indices, axis=None):
     """
     Analogous to ndarray.take; rewraps the selected values with the
     original name/freq.
     """
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return self._simple_new(selected, self.name, freq=self.freq)
Пример #18
0
    def take(self, indices, axis=0):
        """
        Analogous to ndarray.take, return SparseDataFrame corresponding to
        requested indices along an axis

        Parameters
        ----------
        indices : list / array of ints
        axis : {0, 1}

        Returns
        -------
        taken : SparseDataFrame
        """
        indices = com._ensure_platform_int(indices)
        taken_values = self.values.take(indices, axis=axis)
        # only the selected axis changes; the other is carried over as-is
        if axis == 0:
            new_index = self.index.take(indices)
            new_columns = self.columns
        else:
            new_index = self.index
            new_columns = self.columns.take(indices)
        return self._constructor(taken_values,
                                 index=new_index,
                                 columns=new_columns)
Пример #19
0
 def take(self, indices, axis=None):
     """
     Analogous to ndarray.take; rewraps the selected values with the
     original name/freq.
     """
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return self._simple_new(selected, self.name, freq=self.freq)
Пример #20
0
    def _make_selectors(self):
        # Precompute the masks/indexers that locate each row of the input
        # in the fully-expanded (unstacked) output grid.
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = [len(x) for x in new_levels]

        # compress the remaining labels into dense observed-group ids
        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = _ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level]
        self.full_shape = ngroups, stride

        # flat position of each row in the (ngroups, stride) output grid
        selector = self.sorted_labels[-1] + stride * comp_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        # two rows mapping to the same output cell => duplicate index entries
        if mask.sum() < len(self.index):
            raise ReshapeError('Index contains duplicate entries, '
                               'cannot reshape')

        self.group_index = comp_index
        self.mask = mask
        self.unique_groups = obs_ids
        self.compressor = comp_index.searchsorted(np.arange(ngroups))
Пример #21
0
def factor_indexer(shape, labels):
    """ given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer
    """
    # row-major strides: shape (a, b, c) -> multipliers (b*c, c, 1)
    strides = np.append(np.array(shape)[::-1].cumprod()[::-1], [1])
    flat = np.sum(np.array(labels).T * strides, axis=1).T
    return com._ensure_platform_int(flat)
Пример #22
0
    def _make_selectors(self):
        # Precompute the masks/indexers that locate each row of the input
        # in the fully-expanded (unstacked) output grid.
        new_levels = self.new_index_levels

        # make the mask
        # flat group id from all index labels except the level being unstacked
        group_index = get_group_index(self.sorted_labels[:-1],
                                      [len(x) for x in new_levels])

        group_index = _ensure_platform_int(group_index)

        # mark which group ids are actually observed in the data
        group_mask = np.zeros(self.full_shape[0], dtype=bool)
        group_mask.put(group_index, True)

        stride = self.index.levshape[self.level]
        # flat position of each row in the (groups, stride) output grid
        selector = self.sorted_labels[-1] + stride * group_index
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        # compress labels
        unique_groups = np.arange(self.full_shape[0])[group_mask]
        compressor = group_index.searchsorted(unique_groups)

        # two rows mapping to the same output cell => duplicate index entries
        if mask.sum() < len(self.index):
            raise ReshapeError('Index contains duplicate entries, '
                               'cannot reshape')

        self.group_mask = group_mask
        self.group_index = group_index
        self.mask = mask
        self.unique_groups = unique_groups
        self.compressor = compressor
Пример #23
0
def values_at_time(obj, time, tz=None, asof=False):
    """
    Select values at particular time of day (e.g. 9:30AM)

    Parameters
    ----------
    time : datetime.time or string
    tz : string or pytz.timezone
        Time zone for time. Corresponding timestamps would be converted to
        time zone of the TimeSeries

    Returns
    -------
    values_at_time : TimeSeries
    """
    from dateutil.parser import parse

    # asof/tz handling and non-DatetimeIndex objects are not supported yet
    if asof:
        raise NotImplementedError
    if tz:
        raise NotImplementedError
    if not isinstance(obj.index, DatetimeIndex):
        raise NotImplementedError

    # accept a "9:30AM"-style string in place of a datetime.time
    if isinstance(time, basestring):
        time = parse(time).time()

    # TODO: time object with tzinfo?

    nanos = _time_to_nanosecond(time)
    locs = lib.values_at_time(obj.index.asi8, nanos)
    return obj.take(com._ensure_platform_int(locs))
Пример #24
0
    def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                       limit=None, takeable=False):
        """Conform rows to `index`, filling unmatched rows with `fill_value`."""
        if level is not None:
            raise TypeError('Reindex by level not supported for sparse')

        # fast path: identical index (modulo an optional defensive copy)
        if self.index.equals(index):
            return self.copy() if copy else self

        if len(self.index) == 0:
            return SparseDataFrame(index=index, columns=self.columns)

        indexer = com._ensure_platform_int(
            self.index.get_indexer(index, method, limit=limit))
        missing = indexer == -1
        any_missing = missing.any()

        new_series = {}
        for col, series in compat.iteritems(self):
            taken = series.values.take(indexer)
            # overwrite positions that had no match in the old index
            if any_missing:
                np.putmask(taken, missing, fill_value)
            new_series[col] = taken

        return SparseDataFrame(new_series, index=index, columns=self.columns,
                               default_fill_value=self.default_fill_value)
Пример #25
0
 def ref_locs(self):
     """Cached indexer of self.items into self.ref_items."""
     if self._ref_locs is None:
         locs = com._ensure_platform_int(
             self.ref_items.get_indexer(self.items))
         # every item must resolve; -1 means it was not found
         if (locs == -1).any():
             raise AssertionError('Some block items were not in block '
                                  'ref_items')
         self._ref_locs = locs
     return self._ref_locs
Пример #26
0
    def _indices_at_time(self, key):
        """
        Return platform-int positions of the values whose time-of-day
        matches `key` (a datetime.time).
        """
        # NOTE: the previous `from dateutil.parser import parse` was
        # never used in this function and has been removed.

        # TODO: time object with tzinfo?

        nanos = _time_to_nanosecond(key)
        indexer = lib.values_at_time(self.asi8, nanos)
        return com._ensure_platform_int(indexer)
Пример #27
0
def _sort_labels(uniques, left, right):
    """Remap `left`/`right` label arrays onto the sort order of `uniques`."""
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    sorter = uniques.argsort()

    # reverse_indexer[old_label] -> label under the sorted ordering
    reverse_indexer = np.empty(len(sorter), dtype=np.int64)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    remapped = []
    for labels in (left, right):
        new_labels = reverse_indexer.take(com._ensure_platform_int(labels))
        # keep the -1 NA sentinel intact after the take
        np.putmask(new_labels, labels == -1, -1)
        remapped.append(new_labels)

    return remapped[0], remapped[1]
Пример #28
0
    def _indices_at_time(self, key):
        """
        Return platform-int positions of the values whose time-of-day
        matches `key` (a datetime.time).
        """
        # NOTE: the previous `from dateutil.parser import parse` was
        # never used in this function and has been removed.

        # TODO: time object with tzinfo?

        nanos = _time_to_nanosecond(key)
        indexer = lib.values_at_time(self.asi8, nanos)
        return com._ensure_platform_int(indexer)
Пример #29
0
def _get_codes_for_values(values, levels):
    """Map each entry of `values` to its position in `levels` (-1 if absent)."""
    from pandas.core.algorithms import _get_data_algo, _hashtables
    # hash on a common dtype; fall back to object when dtypes differ
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    table = hash_klass(len(levels))
    table.map_locations(levels)
    return com._ensure_platform_int(table.lookup(values))
Пример #30
0
 def ref_locs(self):
     """Cached indexer of self.items into self.ref_items."""
     if self._ref_locs is None:
         locs = com._ensure_platform_int(
             self.ref_items.get_indexer(self.items))
         # every item must resolve; -1 means it was not found
         if (locs == -1).any():
             raise AssertionError('Some block items were not in block '
                                  'ref_items')
         self._ref_locs = locs
     return self._ref_locs
Пример #31
0
def _get_codes_for_values(values, levels):
    """Map each entry of `values` to its position in `levels` (-1 if absent)."""
    from pandas.core.algorithms import _get_data_algo, _hashtables
    # hash on a common dtype; fall back to object when dtypes differ
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)
    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    table = hash_klass(len(levels))
    table.map_locations(levels)
    return com._ensure_platform_int(table.lookup(values))
Пример #32
0
    def _get_group_levels(self, mask, obs_ids):
        """Return [(name, level values), ...] for each grouping, with level
        values reconstructed from the observed group ids."""
        recons_labels = decons_group_index(obs_ids, self._group_shape)

        return [(ping.name,
                 ping.group_index.take(com._ensure_platform_int(codes)))
                for ping, codes in zip(self.groupings, recons_labels)]
Пример #33
0
    def take(self,
             indices,
             axis=0,
             allow_fill=True,
             fill_value=None,
             **kwargs):
        """
        Sparse-compatible version of ndarray.take

        Returns
        -------
        taken : ndarray
        """
        # reject stray numpy-compat kwargs (e.g. out=, mode=)
        nv.validate_take(tuple(), kwargs)

        if axis:
            raise ValueError("axis must be 0, input was {0}".format(axis))

        if com.is_integer(indices):
            # return scalar
            return self[indices]

        indices = com._ensure_platform_int(indices)
        n = len(self)
        if allow_fill and fill_value is not None:
            # allow -1 to indicate self.fill_value,
            # self.fill_value may not be NaN
            if (indices < -1).any():
                msg = ('When allow_fill=True and fill_value is not None, '
                       'all indices must be >= -1')
                raise ValueError(msg)
            elif (n <= indices).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))
        else:
            if ((indices < -n) | (n <= indices)).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))

        # the sparse-index machinery works in int32
        indices = indices.astype(np.int32)
        if not (allow_fill and fill_value is not None):
            # wrap negative indices ndarray-style; copy first so the
            # caller's array is not mutated
            indices = indices.copy()
            indices[indices < 0] += n

        # locations of requested indices within sp_values (-1 = not stored,
        # i.e. the position holds the fill value)
        locs = self.sp_index.lookup_array(indices)
        indexer = np.arange(len(locs), dtype=np.int32)
        mask = locs != -1
        if mask.any():
            indexer = indexer[mask]
            new_values = self.sp_values.take(locs[mask])
        else:
            # nothing requested was actually stored: empty sparse payload
            indexer = np.empty(shape=(0, ), dtype=np.int32)
            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
        return self._simple_new(new_values, sp_index, self.fill_value)
Пример #34
0
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take; contiguous selections are served as a
     cheap slice instead of a copy.
     """
     as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(as_slice, slice):
         return self[as_slice]
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return DatetimeIndex(selected, tz=self.tz, name=self.name)
Пример #35
0
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take; contiguous selections are served as a
     cheap slice instead of a copy.
     """
     as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(as_slice, slice):
         return self[as_slice]
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return DatetimeIndex(selected, tz=self.tz, name=self.name)
Пример #36
0
    def remove_unused_levels(self):
        """ Removes levels which are not used.

        The level removal is done inplace.
        """
        # codes that actually occur in the data
        used_codes = sorted(np.unique(self._codes))
        kept = _ensure_index(
            self.levels.take(com._ensure_platform_int(used_codes)))
        # recompute codes against the reduced level set
        self._codes = _get_codes_for_values(self.__array__(), kept)
        self._levels = kept
Пример #37
0
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take; contiguous selections are served as a
     cheap slice instead of a copy.
     """
     as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(as_slice, slice):
         return self[as_slice]
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return self._simple_new(selected, self.name, None, self.tz)
Пример #38
0
def test_ensure_platform_int():
    """Conversion to platform int always yields np.int_ regardless of
    the index's starting dtype."""

    # verify that when we create certain types of indices
    # they remain the correct type under platform conversions
    from pandas.core.index import Int64Index

    # int64
    x = Int64Index([1, 2, 3], dtype='int64')
    assert (x.dtype == np.int64)

    pi = com._ensure_platform_int(x)
    assert (pi.dtype == np.int_)

    # int32
    x = Int64Index([1, 2, 3], dtype='int32')
    assert (x.dtype == np.int32)

    pi = com._ensure_platform_int(x)
    assert (pi.dtype == np.int_)
Пример #39
0
 def take(self, indices, axis=0, **kwargs):
     """
     Analogous to ndarray.take; contiguous selections collapse to a
     slice and avoid copying.
     """
     int_indices = com._ensure_int64(indices)
     as_slice = lib.maybe_indices_to_slice(int_indices, len(self))
     if isinstance(as_slice, slice):
         return self[as_slice]
     new_i8s = self.asi8.take(com._ensure_platform_int(int_indices))
     return self._shallow_copy(new_i8s, freq=None)
Пример #40
0
    def get_group_levels(self):
        """Level values (one array per grouping) reconstructed from the
        observed group ids."""
        obs_ids = self.group_info[1]
        recons_labels = decons_group_index(obs_ids, self.shape)

        return [ping.group_index.take(com._ensure_platform_int(codes))
                for ping, codes in zip(self.groupings, recons_labels)]
Пример #41
0
 def take(self, indices, axis=None):
     """
     Analogous to ndarray.take; re-wraps the selected ordinals as a
     PeriodIndex carrying over freq and name.
     """
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     selected = selected.view(PeriodIndex)
     selected.freq = self.freq
     selected.name = self.name
     return selected
Пример #42
0
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take; contiguous selections are served as a
     cheap slice instead of a copy.
     """
     as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
     if isinstance(as_slice, slice):
         return self[as_slice]
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     return self._simple_new(selected, self.name, None, self.tz)
Пример #43
0
    def get_group_levels(self):
        """Level values (one array per grouping) reconstructed from the
        observed group ids."""
        obs_ids = self.group_info[1]
        recons_labels = decons_group_index(obs_ids, self.shape)

        return [ping.group_index.take(com._ensure_platform_int(codes))
                for ping, codes in zip(self.groupings, recons_labels)]
Пример #44
0
    def remove_unused_levels(self):
        """ Removes levels which are not used.

        The level removal is done inplace.
        """
        # codes that actually occur in the data
        used_codes = sorted(np.unique(self._codes))
        kept = _ensure_index(
            self.levels.take(com._ensure_platform_int(used_codes)))
        # recompute codes against the reduced level set
        self._codes = _get_codes_for_values(self.__array__(), kept)
        self._levels = kept
Пример #45
0
 def take(self, indices, axis=None):
     """
     Analogous to ndarray.take; re-wraps the selected ordinals as a
     PeriodIndex carrying over freq and name.
     """
     locs = com._ensure_platform_int(indices)
     selected = self.values.take(locs, axis=axis)
     selected = selected.view(PeriodIndex)
     selected.freq = self.freq
     selected.name = self.name
     return selected
Пример #46
0
 def take(self, indices, axis=0):
     """
     Analogous to ndarray.take; contiguous selections collapse to a
     slice and avoid copying.
     """
     int_indices = com._ensure_int64(indices)
     as_slice = lib.maybe_indices_to_slice(int_indices, len(self))
     if isinstance(as_slice, slice):
         return self[as_slice]
     new_i8s = self.asi8.take(com._ensure_platform_int(int_indices))
     return self._shallow_copy(new_i8s, freq=None)
Пример #47
0
def test_ensure_platform_int():
    """Conversion to platform int always yields np.int_ regardless of
    the index's starting dtype."""

    # verify that when we create certain types of indices
    # they remain the correct type under platform conversions
    from pandas.core.index import Int64Index

    # int64
    x = Int64Index([1, 2, 3], dtype='int64')
    assert(x.dtype == np.int64)

    pi = com._ensure_platform_int(x)
    assert(pi.dtype == np.int_)

    # int32
    x = Int64Index([1, 2, 3], dtype='int32')
    assert(x.dtype == np.int32)

    pi = com._ensure_platform_int(x)
    assert(pi.dtype == np.int_)
Пример #48
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : array-like
    sort : boolean, default False
        Sort by values
    order : optional, unused
    na_sentinel : int, default -1
        Value to mark "not found"

    Examples
    --------
    >>> factorize([12,3,8,5,9,7,11],sort=True,order=None,na_sentinel=-1)
    (array([6, 0, 3, 1, 4, 2, 5]), array([ 3,  5,  7,  8,  9, 11, 12], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6]), array([12,  3,  8,  5,  9,  7, 10], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6, 6]), array([12,  3,  8,  5,  9,  7, 10], dtype=int64))

    Returns
    -------
    labels : ndarray (platform int)
        Code of each value within `uniques` (`na_sentinel` marks NA)
    uniques : ndarray
        The distinct values without duplication (sorted when ``sort=True``)
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        # remap labels so they refer to positions in the sorted uniques
        sorter = uniques.argsort()
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # restore NA sentinels clobbered by the take above
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.view('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
Пример #49
0
    def describe(self):
        """ Describes this Categorical

        Returns
        -------
        description: `DataFrame`
            A dataframe with frequency and counts by level.
        """
        # Hack?
        from pandas.core.frame import DataFrame
        # both columns hold the codes; one survives the groupby as the count
        counts = DataFrame({
            'codes': self._codes,
            'values': self._codes
        }).groupby('codes').count()

        freqs = counts / float(counts.sum())

        from pandas.tools.merge import concat
        result = concat([counts, freqs], axis=1)
        result.columns = ['counts', 'freqs']

        # fill in the real levels
        # -1 in the index means NaN values were present
        check = result.index == -1
        if check.any():
            # Sort -1 (=NaN) to the last position
            index = np.arange(0, len(self.levels) + 1, dtype='int64')
            index[-1] = -1
            result = result.reindex(index)
            # build new index
            levels = np.arange(0, len(self.levels) + 1, dtype=object)
            levels[:-1] = self.levels
            levels[-1] = np.nan
            result.index = levels.take(com._ensure_platform_int(result.index))
        else:
            result.index = self.levels.take(
                com._ensure_platform_int(result.index))
            # include levels that never occur in the data
            result = result.reindex(self.levels)
        result.index.name = 'levels'

        return result
Пример #50
0
    def take(self, indices, axis=0, allow_fill=True,
             fill_value=None, **kwargs):
        """
        Sparse-compatible version of ndarray.take

        Returns
        -------
        taken : ndarray
        """
        # reject stray numpy-compat kwargs (e.g. out=, mode=)
        nv.validate_take(tuple(), kwargs)

        if axis:
            raise ValueError("axis must be 0, input was {0}".format(axis))

        if com.is_integer(indices):
            # return scalar
            return self[indices]

        indices = com._ensure_platform_int(indices)
        n = len(self)
        if allow_fill and fill_value is not None:
            # allow -1 to indicate self.fill_value,
            # self.fill_value may not be NaN
            if (indices < -1).any():
                msg = ('When allow_fill=True and fill_value is not None, '
                       'all indices must be >= -1')
                raise ValueError(msg)
            elif (n <= indices).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))
        else:
            if ((indices < -n) | (n <= indices)).any():
                msg = 'index is out of bounds for size {0}'
                raise IndexError(msg.format(n))

        # the sparse-index machinery works in int32
        indices = indices.astype(np.int32)
        if not (allow_fill and fill_value is not None):
            # wrap negative indices ndarray-style; copy first so the
            # caller's array is not mutated
            indices = indices.copy()
            indices[indices < 0] += n

        # locations of requested indices within sp_values (-1 = not stored,
        # i.e. the position holds the fill value)
        locs = self.sp_index.lookup_array(indices)
        indexer = np.arange(len(locs), dtype=np.int32)
        mask = locs != -1
        if mask.any():
            indexer = indexer[mask]
            new_values = self.sp_values.take(locs[mask])
        else:
            # nothing requested was actually stored: empty sparse payload
            indexer = np.empty(shape=(0, ), dtype=np.int32)
            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
        return self._simple_new(new_values, sp_index, self.fill_value)
Пример #51
0
def _get_codes_for_values(values, levels):
    """
    Utility routine to turn values into codes given the specified levels.

    Parameters
    ----------
    values : ndarray
        Values to encode.
    levels : ndarray
        Allowed values; a value's position in this array becomes its code.

    Returns
    -------
    ndarray of platform int
        For each element of ``values``, the result of the hashtable
        lookup against ``levels``.
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    # compare like with like: fall back to object dtype on a mismatch
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)

    # only the hashtable class is needed here; the vector class and the
    # processed values returned by _get_data_algo were unused
    (hash_klass, _), _ = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(com._values_from_object(levels))
    return com._ensure_platform_int(t.lookup(values))
Пример #52
0
    def describe(self):
        """ Describes this Categorical

        Returns
        -------
        description: `DataFrame`
            A dataframe with frequency and counts by category.
        """
        # Hack?
        from pandas.core.frame import DataFrame
        from pandas.tools.merge import concat

        # count occurrences of each code (grouping key == counted column)
        counts = DataFrame({
            'codes' : self._codes,
            'values' : self._codes }
                           ).groupby('codes').count()
        freqs = counts / float(counts.sum())

        result = concat([counts, freqs], axis=1)
        result.columns = ['counts', 'freqs']

        # fill in the real categories
        has_na = (result.index == -1).any()
        if has_na:
            # Sort -1 (=NaN) to the last position
            ncats = len(self.categories)
            new_order = np.arange(0, ncats + 1, dtype='int64')
            new_order[-1] = -1
            result = result.reindex(new_order)
            # build new index: real categories followed by NaN
            labels = np.arange(0, ncats + 1, dtype=object)
            labels[:-1] = self.categories
            labels[-1] = np.nan
            result.index = labels.take(com._ensure_platform_int(result.index))
        else:
            result.index = self.categories.take(
                com._ensure_platform_int(result.index))
            result = result.reindex(self.categories)
        result.index.name = 'categories'

        return result
Пример #53
0
def _get_codes_for_values(values, categories):
    """
    Utility routine to turn values into codes given the specified categories.

    Parameters
    ----------
    values : ndarray
        Values to encode.
    categories : ndarray
        Allowed values; a value's position in this array becomes its code.

    Returns
    -------
    ndarray of platform int
        For each element of ``values``, the result of the hashtable
        lookup against ``categories``.
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    # compare like with like: fall back to object dtype on a mismatch
    if values.dtype != categories.dtype:
        values = com._ensure_object(values)
        categories = com._ensure_object(categories)

    # only the hashtable class is needed here; the vector class and the
    # processed values returned by _get_data_algo were unused
    (hash_klass, _), _ = _get_data_algo(values, _hashtables)
    t = hash_klass(len(categories))
    t.map_locations(com._values_from_object(categories))
    return com._ensure_platform_int(t.lookup(values))
Пример #54
0
    def get_group_levels(self):
        """Return, per grouping, the group-index values taken at the
        reconstructed observed labels."""
        obs_ids = self.group_info[1]

        if self._overflow_possible:
            # obs_ids holds tuples of per-level labels; transpose them
            decoded = [np.array(x) for x in izip(*obs_ids)]
        else:
            decoded = decons_group_index(obs_ids, self.shape)

        return [ping.group_index.take(com._ensure_platform_int(labels))
                for ping, labels in zip(self.groupings, decoded)]
Пример #55
0
def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
    """Generic fallback take: gather ``arr`` along ``axis`` into ``out``,
    then write ``fill_value`` where ``indexer`` was -1.

    ``mask_info``, when given, is a precomputed ``(mask, needs_masking)``
    pair; otherwise it is derived from ``indexer == -1``.
    """
    if mask_info is None:
        mask = indexer == -1
        needs_masking = mask.any()
    else:
        mask, needs_masking = mask_info

    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)

    if arr.shape[axis] > 0:
        arr.take(com._ensure_platform_int(indexer), axis=axis, out=out)

    if not needs_masking:
        return
    # select the masked positions along `axis`, full slices elsewhere
    fill_slices = [slice(None)] * arr.ndim
    fill_slices[axis] = mask
    out[tuple(fill_slices)] = fill_value
Пример #56
0
    def take(self, indexer, axis=0, allow_fill=True, fill_value=None):
        """
        For internal compatibility with numpy arrays.

        Takes the codes at ``indexer`` positions and rebuilds from them.
        ``axis``, ``allow_fill`` and ``fill_value`` are accepted for
        signature compatibility but not used in this body; per the
        original note, filling must always be None/nan here but is
        passed thru internally (``assert isnull(fill_value)``).

        See also
        --------
        numpy.ndarray.take
        """

        indexer = com._ensure_platform_int(indexer)
        taken = self.codes.take(indexer)
        return self._create_from_codes(taken)
Пример #57
0
    def _make_sorted_values_labels(self):
        """Compute a group-sorted view of the index values and labels,
        with level ``self.level`` rotated to the end, and store the
        results in ``self.sorted_values`` / ``self.sorted_labels``.
        """
        v = self.level  # position of the level being rotated last

        labs = list(self.index.labels)
        levs = list(self.index.levels)
        # rotate the target level's labels/sizes to the end of the lists
        to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
        sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

        # compress the multi-key into single observation ids for sorting
        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        # groupsort_indexer returns (indexer, counts); only the indexer is needed
        indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
        # NOTE(review): bare _ensure_platform_int here vs com._ensure_platform_int
        # elsewhere — presumably imported directly in this module; verify.
        indexer = _ensure_platform_int(indexer)

        self.sorted_values = com.take_nd(self.values, indexer, axis=0)
        self.sorted_labels = [l.take(indexer) for l in to_sort]
Пример #58
0
    def take(self, indices, axis=0, allow_fill=True, fill_value=None):
        """
        Analogous to ndarray.take
        """
        indices = com._ensure_int64(indices)

        # if the indices reduce to a slice, serve the request via __getitem__
        as_slice = lib.maybe_indices_to_slice(indices, len(self))
        if isinstance(as_slice, slice):
            return self[as_slice]

        taken = self.asi8.take(com._ensure_platform_int(indices))

        # only fill if we are passing a non-None fill_value
        if allow_fill and fill_value is not None:
            missing = indices == -1
            if missing.any():
                taken[missing] = tslib.iNaT

        return self._shallow_copy(taken, freq=None)