def _factorize_keys(lk, rk, sort=True):
    """
    Factorize two key arrays with one shared factorizer.

    Parameters
    ----------
    lk, rk : array-like
        Left and right key values.
    sort : boolean, default True
        If True, both label arrays are passed through ``_sort_labels``
        together with the factorizer's uniques.

    Returns
    -------
    llab, rlab, count : left labels, right labels, number of groups
    """
    both_int_like = (com._is_int_or_datetime_dtype(lk) and
                     com._is_int_or_datetime_dtype(rk))
    if both_int_like:
        factorizer_cls = lib.Int64Factorizer
        lk, rk = com._ensure_int64(lk), com._ensure_int64(rk)
    else:
        factorizer_cls = lib.Factorizer
        lk, rk = com._ensure_object(lk), com._ensure_object(rk)

    rizer = factorizer_cls(max(len(lk), len(rk)))

    # the same factorizer instance handles both sides, left first
    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)
    count = rizer.get_count()

    if sort:
        llab, rlab = _sort_labels(rizer.uniques.to_array(), llab, rlab)

    # NA group: labels equal to -1 are re-pointed at one extra trailing group
    left_na = llab == -1
    left_has_na = left_na.any()
    right_na = rlab == -1
    right_has_na = right_na.any()

    if left_has_na:
        np.putmask(llab, left_na, count)
    if right_has_na:
        np.putmask(rlab, right_na, count)
    if left_has_na or right_has_na:
        count += 1

    return llab, rlab, count
def _get_single_indexer(join_key, index, sort=False):
    """
    Compute left/right indexers for joining ``join_key`` against ``index``.

    Both inputs are factorized together, then the int64 labels are handed
    to ``algos.left_outer_join``.

    Returns
    -------
    left_indexer, right_indexer
    """
    lkey, rkey, n_groups = _factorize_keys(join_key, index, sort=sort)

    lkey = com._ensure_int64(lkey)
    rkey = com._ensure_int64(rkey)

    left_indexer, right_indexer = algos.left_outer_join(
        lkey, rkey, n_groups, sort=sort)
    return left_indexer, right_indexer
def _from_arraylike(cls, data, freq, tz):
    """
    Coerce array-like input for ``PeriodIndex()`` and settle on a frequency.

    Parameters
    ----------
    cls
        Not referenced in the body.
    data : ndarray, PeriodIndex, DatetimeIndex, Int64Index or other iterable
    freq : frequency or None
        When None, it is taken from ``data.freq`` (PeriodIndex input) or
        from the ``freq`` attribute of the first element, if present.
    tz
        Only forwarded to ``dt64arr_to_periodarr`` for datetime64 input.

    Returns
    -------
    (data, freq)

    Raises
    ------
    ValueError
        If a scalar or a single Period is passed, or if no frequency was
        given and none could be read from the first element.
    """
    if not isinstance(data, (np.ndarray, PeriodIndex,
                             DatetimeIndex, Int64Index)):
        if np.isscalar(data) or isinstance(data, Period):
            raise ValueError('PeriodIndex() must be called with a '
                             'collection of some kind, %s was passed'
                             % repr(data))

        # other iterable of some kind
        if not isinstance(data, (list, tuple)):
            data = list(data)

        try:
            data = com._ensure_int64(data)
            # this ValueError is caught by the handler just below, which
            # then tries to read the frequency off the first element
            if freq is None:
                raise ValueError('freq not specified')
            data = np.array([Period(x, freq=freq).ordinal for x in data],
                            dtype=np.int64)
        except (TypeError, ValueError):
            # fall back to treating the elements as generic objects
            data = com._ensure_object(data)

            if freq is None and len(data) > 0:
                freq = getattr(data[0], 'freq', None)

            if freq is None:
                raise ValueError('freq not specified and cannot be '
                                 'inferred from first element')

            data = _get_ordinals(data, freq)
    else:
        if isinstance(data, PeriodIndex):
            if freq is None or freq == data.freq:
                # same (or unspecified) frequency: reuse the values as-is
                freq = data.freq
                data = data.values
            else:
                # different frequency requested: convert between the two
                # frequency codes returned by _gfc
                base1, _ = _gfc(data.freq)
                base2, _ = _gfc(freq)
                data = period.period_asfreq_arr(data.values, base1,
                                                base2, 1)
        else:
            if freq is None and len(data) > 0:
                freq = getattr(data[0], 'freq', None)

            if freq is None:
                raise ValueError('freq not specified and cannot be '
                                 'inferred from first element')

            # int64 input is returned untouched
            if data.dtype != np.int64:
                if np.issubdtype(data.dtype, np.datetime64):
                    data = dt64arr_to_periodarr(data, freq, tz)
                else:
                    try:
                        data = com._ensure_int64(data)
                    except (TypeError, ValueError):
                        data = com._ensure_object(data)
                        data = _get_ordinals(data, freq)

    return data, freq
def mode(values):
    """
    Return the mode(s) of a Series or ndarray, sorted when possible.

    Integer and datetime64/timedelta64 input goes through the int64 hash
    table; everything else is handled as objects. The result is built with
    the input Series' constructor, or ``Series`` for other input, and keeps
    the input dtype.
    """
    # sorting is required because hash order isn't necessarily defined.
    from pandas.core.series import Series

    if isinstance(values, Series):
        constructor, values = values._constructor, values.values
    else:
        constructor, values = Series, np.asanyarray(values)

    dtype = values.dtype

    if com.is_integer_dtype(dtype):
        modes = htable.mode_int64(com._ensure_int64(values))
        return constructor(sorted(modes), dtype=dtype)

    if issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        modes = htable.mode_int64(values.view(np.int64))
        return constructor(sorted(modes), dtype=dtype)

    null_mask = com.isnull(values)
    modes = htable.mode_object(com._ensure_object(values), null_mask)
    try:
        modes = sorted(modes)
    except TypeError as e:
        # unorderable values: keep the modes in hash-table order
        warn("Unable to sort modes: %s" % e)
    return constructor(modes, dtype=dtype)
def _get_data_algo(values, func_map):
    """
    Pick the implementation from ``func_map`` matching the dtype of
    ``values`` and coerce ``values`` to what that implementation expects.

    Returns
    -------
    f, values : the selected entry of ``func_map`` and the coerced values
    """
    if com.is_float_dtype(values):
        return func_map['float64'], com._ensure_float64(values)

    if com.needs_i8_conversion(values):
        null_mask = com.isnull(values)
        if not null_mask.ravel().any():
            return func_map['int64'], values.view('i8')

        # if we have NaT, punt to object dtype
        algo = func_map['generic']
        values = com._ensure_object(values)
        values[null_mask] = np.nan
        return algo, values

    if com.is_integer_dtype(values):
        return func_map['int64'], com._ensure_int64(values)

    return func_map['generic'], com._ensure_object(values)
def get_result(self):
    """
    Assemble the reshaped values, index and columns into a DataFrame.

    Columns with no observed value are dropped when fewer groups were
    observed than ``full_shape[1]``; rows are padded with NaN when the new
    index contains nulls.
    """
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        _, observed = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(observed) < self.full_shape[1]:
            keep = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, keep, axis=1)
            columns = columns[keep]

    # we might have a missing index
    if len(index) != values.shape[0]:
        missing = isnull(index)
        if not missing.any():
            index = index.take(self.unique_groups)
        else:
            compact = values
            positions = np.arange(len(index))
            values = np.empty((len(index), compact.shape[1]))
            values.fill(np.nan)
            # copy each compact row to the position of its non-null label
            target_rows = com._ensure_int64(positions[~missing])
            for src, dst in enumerate(target_rows):
                values[dst] = compact[src]

    return DataFrame(values, index=index, columns=columns)
def _check(dtype):
    """
    Run ``algos.group_ohlc_<dtype>`` on 20 random values split into three
    groups (6, 6 and 8 rows) and compare with a pure-numpy reference,
    first on the raw data and again with the first group set to NaN.
    """
    obj = np.array(np.random.randn(20), dtype=dtype)

    edges = np.array([6, 12, 20])
    group_sizes = np.diff(np.r_[0, edges])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    labels = com._ensure_int64(np.repeat(np.arange(3), group_sizes))

    func = getattr(algos, 'group_ohlc_%s' % dtype)
    func(out, counts, obj[:, None], labels)

    def _ohlc(group):
        # reference open/high/low/close for one group
        if isnull(group).all():
            return np.repeat(nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    segments = (obj[:6], obj[6:12], obj[12:])
    expected = np.array([_ohlc(segment) for segment in segments])

    assert_almost_equal(out, expected)
    tm.assert_numpy_array_equal(counts,
                                np.array([6, 6, 8], dtype=np.int64))

    # an all-NaN first group must produce an all-NaN first row
    obj[:6] = nan
    func(out, counts, obj[:, None], labels)
    expected[0] = nan
    assert_almost_equal(out, expected)
def value_counts(values, sort=True, ascending=False):
    """
    Count how often each non-null value occurs.

    Parameters
    ----------
    values : array-like exposing ``dtype``
    sort : boolean, default True
        Sort the result by count
    ascending : boolean, default False
        When sorting, keep ascending order instead of reversing it

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict

    if com.is_integer_dtype(values.dtype):
        keys, counts = lib.value_count_int64(com._ensure_int64(values))
        result = Series(counts, index=keys)
    else:
        tally = defaultdict(int)
        for value in values[com.notnull(values)]:
            tally[value] += 1
        result = Series(tally)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
def delete(self, loc):
    """
    Make a new TimedeltaIndex with passed location(s) deleted.

    Parameters
    ----------
    loc: int, slice or array of ints
        Indicate which sub-arrays to remove.

    Returns
    -------
    new_index : TimedeltaIndex
        Keeps ``self.freq`` when the deletion touches only an end of the
        index; otherwise the frequency is passed as 'infer'.
    """
    new_tds = np.delete(self.asi8, loc)
    n = len(self)

    if is_integer(loc):
        keeps_freq = loc in (0, -n, -1, n - 1)
    else:
        if com.is_list_like(loc):
            loc = lib.maybe_indices_to_slice(
                com._ensure_int64(np.array(loc)), n)
        keeps_freq = (isinstance(loc, slice) and
                      loc.step in (1, None) and
                      (loc.start in (0, None) or loc.stop in (n, None)))

    freq = self.freq if keeps_freq else 'infer'
    return TimedeltaIndex(new_tds, name=self.name, freq=freq)
def delete(self, loc):
    """
    Make a new TimedeltaIndex with passed location(s) deleted.

    Parameters
    ----------
    loc: int, slice or array of ints
        Indicate which sub-arrays to remove.

    Returns
    -------
    new_index : TimedeltaIndex
        ``self.freq`` is kept only when an end of the index is removed;
        in every other case the frequency is passed as 'infer'.
    """
    new_tds = np.delete(self.asi8, loc)
    size = len(self)
    freq = 'infer'

    if lib.is_integer(loc):
        at_edge = loc in (0, -size, -1, size - 1)
    else:
        if com.is_list_like(loc):
            as_int64 = com._ensure_int64(np.array(loc))
            loc = lib.maybe_indices_to_slice(as_int64)

        at_edge = False
        if isinstance(loc, slice) and loc.step in (1, None):
            at_edge = loc.start in (0, None) or loc.stop in (size, None)

    if at_edge:
        freq = self.freq

    return TimedeltaIndex(new_tds, name=self.name, freq=freq)
def _check(dtype):
    """
    Compare ``algos.group_ohlc_<dtype>`` against a numpy reference on 20
    random values in three groups of 6, 6 and 8 rows, then repeat with the
    first group replaced by NaN.
    """
    obj = np.array(np.random.randn(20), dtype=dtype)

    boundaries = np.array([6, 12, 20])
    repeats = np.diff(np.r_[0, boundaries])
    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)
    labels = com._ensure_int64(np.repeat(np.arange(3), repeats))

    func = getattr(algos, 'group_ohlc_%s' % dtype)
    func(out, counts, obj[:, None], labels)

    def _ohlc(group):
        # first, max, min, last of the group; NaNs if the group is all null
        if isnull(group).all():
            return np.repeat(nan, 4)
        return [group[0], group.max(), group.min(), group[-1]]

    first, second, third = obj[:6], obj[6:12], obj[12:]
    expected = np.array([_ohlc(first), _ohlc(second), _ohlc(third)])

    assert_almost_equal(out, expected)
    assert_almost_equal(counts, [6, 6, 8])

    obj[:6] = nan
    func(out, counts, obj[:, None], labels)
    expected[0] = nan
    assert_almost_equal(out, expected)
def get_result(self):
    """
    Build the output DataFrame from the reshaped values, index and columns.

    Unobserved columns are filtered out when fewer groups were observed
    than ``full_shape[1]``; when the new index holds nulls the values are
    spread into a NaN-filled array of matching length.
    """
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        _, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            observed_cols = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, observed_cols, axis=1)
            columns = columns[observed_cols]

    # we might have a missing index
    if len(index) != values.shape[0]:
        null_rows = isnull(index)
        if null_rows.any():
            n_rows = len(index)
            orig_values = values
            values = np.empty((n_rows, orig_values.shape[1]))
            values.fill(np.nan)
            row_numbers = np.arange(n_rows)
            destinations = com._ensure_int64(row_numbers[~null_rows])
            for source, destination in enumerate(destinations):
                values[destination] = orig_values[source]
        else:
            index = index.take(self.unique_groups)

    return DataFrame(values, index=index, columns=columns)
def value_counts(values, sort=True, ascending=False):
    """
    Tally the occurrences of each non-null value

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the resulting Series
    ascending : boolean, default False
        Leave the sorted result ascending rather than reversing it

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    if not com.is_integer_dtype(values.dtype):
        # generic path: count the non-null values one at a time
        tally = defaultdict(int)
        non_null = values[com.notnull(values)]
        for value in non_null:
            tally[value] += 1
        result = Series(tally)
    else:
        int_values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(int_values)
        result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take

    Indices that collapse to a slice are served by ``self[slice]``;
    anything else is delegated to the parent class.
    """
    as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
    if not isinstance(as_slice, slice):
        return super(DatetimeIndexOpsMixin, self).take(indices, axis)
    return self[as_slice]
def func(arr, indexer, out, fill_value=np.nan):
    """
    Forward to ``_take_nd_generic`` with an int64 indexer.

    ``axis`` and ``mask_info`` are not parameters; they are read from the
    enclosing scope.
    """
    int64_indexer = com._ensure_int64(indexer)
    _take_nd_generic(arr, int64_indexer, out,
                     axis=axis,
                     fill_value=fill_value,
                     mask_info=mask_info)
def _get_multiindex_indexer(join_keys, index, sort=False):
    """
    Compute left/right indexers for joining ``join_keys`` against an index
    exposing ``levels`` and ``labels``.

    Each key is factorized against its level, the per-level labels are
    combined into group indices, and those are fed to
    ``algos.left_outer_join``. The ``sort`` argument is not used.

    Returns
    -------
    left_indexer, right_indexer
    """
    factorized = [_factorize_keys(level, key, sort=False)
                  for level, key in zip(index.levels, join_keys)]
    # only the key-side labels and the group counts are needed
    labels = [rlab for _, rlab, _ in factorized]
    shape = [count for _, _, count in factorized]

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    left_group_key, right_group_key, max_groups = _factorize_keys(
        left_group_key, right_group_key, sort=False)

    left_indexer, right_indexer = algos.left_outer_join(
        com._ensure_int64(left_group_key),
        com._ensure_int64(right_group_key),
        max_groups,
        sort=False)

    return left_indexer, right_indexer
def unique1d(values):
    """
    Hash table-based unique

    Floats, datetime64 and integers each use a typed hash table;
    everything else goes through the object hash table. datetime64 results
    are viewed back as ``M8[ns]``.
    """
    dtype = values.dtype
    size_hint = len(values)

    if np.issubdtype(dtype, np.floating):
        table = _hash.Float64HashTable(size_hint)
        floats = table.unique(com._ensure_float64(values))
        return np.array(floats, dtype=np.float64)

    if np.issubdtype(dtype, np.datetime64):
        table = _hash.Int64HashTable(size_hint)
        stamps = table.unique(com._ensure_int64(values))
        return stamps.view("M8[ns]")

    if np.issubdtype(dtype, np.integer):
        table = _hash.Int64HashTable(size_hint)
        return table.unique(com._ensure_int64(values))

    table = _hash.PyObjectHashTable(size_hint)
    return table.unique(com._ensure_object(values))
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``; otherwise the
    taken values are wrapped in a new DatetimeIndex with the same tz and
    name.
    """
    as_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
    if not isinstance(as_slice, slice):
        picked = self.values.take(indices, axis=axis)
        return DatetimeIndex(picked, tz=self.tz, name=self.name)
    return self[as_slice]
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``; otherwise the
    values are taken with platform-int indices and wrapped through
    ``_simple_new`` with the same name and tz.
    """
    slice_or_indices = lib.maybe_indices_to_slice(
        com._ensure_int64(indices))
    if isinstance(slice_or_indices, slice):
        return self[slice_or_indices]

    platform_indices = com._ensure_platform_int(indices)
    picked = self.values.take(platform_indices, axis=axis)
    return self._simple_new(picked, self.name, None, self.tz)
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``; otherwise the
    values are taken with platform-int indices and a new DatetimeIndex
    with the same tz and name is returned.
    """
    candidate = lib.maybe_indices_to_slice(com._ensure_int64(indices))
    if not isinstance(candidate, slice):
        picked = self.values.take(com._ensure_platform_int(indices),
                                  axis=axis)
        return DatetimeIndex(picked, tz=self.tz, name=self.name)
    return self[candidate]
def _simple_new(cls, values, name, freq=None, tz=None):
    """
    Create an instance of ``cls`` as a view on ``values``.

    Values whose dtype differs from ``_NS_DTYPE`` are first coerced to
    int64 and viewed as ``_NS_DTYPE``. ``freq`` is stored on the
    ``offset`` attribute.
    """
    needs_cast = values.dtype != _NS_DTYPE
    if needs_cast:
        as_int64 = com._ensure_int64(values)
        values = as_int64.view(_NS_DTYPE)

    new_index = values.view(cls)
    new_index.name = name
    new_index.offset = freq
    new_index.tz = tools._maybe_get_tz(tz)
    return new_index
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``; otherwise the
    i8 values are taken and shallow-copied with ``freq=None``. ``axis`` is
    accepted but not used.
    """
    indices = com._ensure_int64(indices)

    as_slice = lib.maybe_indices_to_slice(indices, len(self))
    if not isinstance(as_slice, slice):
        platform_indices = com._ensure_platform_int(indices)
        picked = self.asi8.take(platform_indices)
        return self._shallow_copy(picked, freq=None)

    return self[as_slice]
def unique1d(values):
    """
    Hash table-based unique

    The hash table type is chosen from ``values.dtype`` (float64, int64
    for integers and datetime64, or object). Float results are re-wrapped
    as a float64 array and datetime64 results are viewed as ``M8[ns]``.
    """
    kind = values.dtype
    is_float = np.issubdtype(kind, np.floating)
    is_datetime = not is_float and np.issubdtype(kind, np.datetime64)

    if is_float:
        table = _hash.Float64HashTable(len(values))
        uniques = table.unique(com._ensure_float64(values))
        uniques = np.array(uniques, dtype=np.float64)
    elif is_datetime or np.issubdtype(kind, np.integer):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
        if is_datetime:
            uniques = uniques.view('M8[ns]')
    else:
        table = _hash.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))

    return uniques
def _make_index_array_level(lev, lab):
    """
    Combine level values with NaN placeholders, preserving nans.

    Returns ``lev`` itself when no label equals -1. Otherwise returns an
    object array of ``len(lab)`` in which the -1 positions hold NaN and
    the remaining positions are filled from ``lev``.
    """
    missing = lab == -1
    if not missing.any():
        return lev

    positions = np.arange(len(lab))

    nan_fill = np.empty(len(missing[missing]), dtype=object)
    nan_fill.fill(np.nan)
    missing_at = com._ensure_int64(positions[missing])

    present_at = com._ensure_int64(positions[~missing])

    combined = np.empty((len(lab),), dtype=object)
    combined[present_at] = lev
    combined[missing_at] = nan_fill
    return combined
def _make_index_array_level(lev, lab):
    """
    Create the combined index array, preserving nans.

    When no label is -1 the level values are returned unchanged;
    otherwise an object array of ``len(lab)`` is returned with NaN at the
    -1 positions and the values of ``lev`` elsewhere.
    """
    is_na = lab == -1
    if not is_na.any():
        return lev

    n = len(lab)
    locations = np.arange(n)

    na_values = np.empty(len(is_na[is_na]), dtype=object)
    na_values.fill(np.nan)
    na_locations = com._ensure_int64(locations[is_na])

    value_locations = com._ensure_int64(locations[~is_na])

    new_labels = np.empty((n,), dtype=object)
    new_labels[value_locations] = lev
    new_labels[na_locations] = na_values

    return new_labels
def take(self, indices, axis=0, **kwargs):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``; otherwise the
    i8 values are taken and shallow-copied with ``freq=None``. ``axis``
    and any extra keyword arguments are accepted but not used.
    """
    indices = com._ensure_int64(indices)
    slice_or_indices = lib.maybe_indices_to_slice(indices, len(self))

    if isinstance(slice_or_indices, slice):
        return self[slice_or_indices]

    picked = self.asi8.take(com._ensure_platform_int(indices))
    return self._shallow_copy(picked, freq=None)
def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value in ``values``.

    Parameters
    ----------
    values : array-like
        Wrapped in a Series internally to obtain the underlying array.
    dropna : boolean, default True
        Forwarded to the scalar hash-table counter. On the
        datetime/timedelta/period path, keys equal to ``iNaT`` are also
        removed when True. On the object path, a NaN entry carrying the
        null count is inserted at position 0 when False and nulls exist.

    Returns
    -------
    (keys, counts)
    """
    # these checks look at the input before it is unwrapped below
    is_datetimetz = com.is_datetimetz(values)
    is_period = (isinstance(values, gt.ABCPeriodIndex) or
                 com.is_period_arraylike(values))

    # kept so the timezone can be read back from the original input
    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            # freq is only bound on the period path; it is read again below
            # under the same is_period condition
            freq = values.freq

        # count on the int64 view of the values
        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz:
            # an index carries tz directly; otherwise it is read via .dt
            if isinstance(orig, gt.ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif com.is_float_dtype(dtype):
        values = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)

    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        # value_count_object is given the null mask; when nulls must be
        # reported they are added back here as a leading NaN entry
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def _get_data_algo(values, func_map):
    """
    Select the float64, int64 or generic entry of ``func_map`` according
    to the dtype of ``values`` and coerce ``values`` to match.

    Returns
    -------
    f, values
    """
    if com.is_float_dtype(values):
        return func_map['float64'], com._ensure_float64(values)

    if com.is_integer_dtype(values):
        return func_map['int64'], com._ensure_int64(values)

    return func_map['generic'], com._ensure_object(values)
def _get_hash_table_and_cast(values):
    """
    Choose the hash table class for ``values`` and cast them to match.

    Floats map to ``Float64HashTable``, integers to ``Int64HashTable`` and
    everything else to ``PyObjectHashTable``.

    Returns
    -------
    klass, values
    """
    if com.is_float_dtype(values):
        return lib.Float64HashTable, com._ensure_float64(values)

    if com.is_integer_dtype(values):
        return lib.Int64HashTable, com._ensure_int64(values)

    return lib.PyObjectHashTable, com._ensure_object(values)
def _get_multiindex_indexer(join_keys, index, sort=False):
    """
    Compute left/right indexers for joining ``join_keys`` against an index
    exposing ``levels`` and ``labels``.

    Every key is factorized against the matching level, the labels are
    combined into group indices, and ``algos.left_outer_join`` is run on
    the re-factorized group keys. The ``sort`` argument is not used.

    Returns
    -------
    left_indexer, right_indexer
    """
    labels, shape = [], []
    for level, key in zip(index.levels, join_keys):
        _, key_labels, n_unique = _factorize_keys(level, key, sort=False)
        labels.append(key_labels)
        shape.append(n_unique)

    left_group_key = get_group_index(labels, shape)
    right_group_key = get_group_index(index.labels, shape)

    factorized = _factorize_keys(left_group_key, right_group_key,
                                 sort=False)
    left_group_key, right_group_key, max_groups = factorized

    left_int64 = com._ensure_int64(left_group_key)
    right_int64 = com._ensure_int64(right_group_key)
    left_indexer, right_indexer = algos.left_outer_join(
        left_int64, right_int64, max_groups, sort=False)

    return left_indexer, right_indexer
def _factorize_keys(lk, rk, sort=True):
    """
    Factorize two key arrays with one shared factorizer.

    Parameters
    ----------
    lk, rk : array-like
        Left and right key values. When both are tz-aware datetimes their
        ``.values`` are used.
    sort : boolean, default True
        If True, both label arrays are passed through ``_sort_labels``
        together with the factorizer's uniques.

    Returns
    -------
    llab, rlab, count : left labels, right labels, number of groups
    """
    if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
        lk, rk = lk.values, rk.values

    int_like = (com.is_int_or_datetime_dtype(lk) and
                com.is_int_or_datetime_dtype(rk))
    if int_like:
        factorizer_cls = _hash.Int64Factorizer
        lk = com._ensure_int64(com._values_from_object(lk))
        rk = com._ensure_int64(com._values_from_object(rk))
    else:
        factorizer_cls = _hash.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    rizer = factorizer_cls(max(len(lk), len(rk)))

    # the same factorizer instance handles both sides, left first
    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)
    count = rizer.get_count()

    if sort:
        llab, rlab = _sort_labels(rizer.uniques.to_array(), llab, rlab)

    # NA group: labels equal to -1 are re-pointed at one extra trailing group
    left_na = llab == -1
    left_has_na = left_na.any()
    right_na = rlab == -1
    right_has_na = right_na.any()

    if left_has_na:
        np.putmask(llab, left_na, count)
    if right_has_na:
        np.putmask(rlab, right_na, count)
    if left_has_na or right_has_na:
        count += 1

    return llab, rlab, count
def _simple_new(cls, values, name=None, freq=None, **kwargs):
    """
    Create an instance of ``cls`` without going through ``__init__``.

    Object-dtype values are converted with ``tslib.array_to_timedelta64``;
    values of any other dtype than ``_TD_DTYPE`` are coerced to int64 and
    viewed as ``_TD_DTYPE``. Extra keyword arguments are accepted but not
    used.
    """
    # NOTE(review): this branch is decided by the truthiness of the dtype
    # attribute, not by an ``is None`` test - confirm that is intended
    if not getattr(values, 'dtype', None):
        values = np.array(values, copy=False)

    if values.dtype == np.object_:
        values = tslib.array_to_timedelta64(values)

    if values.dtype != _TD_DTYPE:
        as_int64 = com._ensure_int64(values)
        values = as_int64.view(_TD_DTYPE)

    new_index = object.__new__(cls)
    new_index._data = values
    new_index.name = name
    new_index.freq = freq
    new_index._reset_identity()
    return new_index
def _factorize_keys(lk, rk, sort=True):
    """
    Factorize two key arrays with one shared factorizer.

    Parameters
    ----------
    lk, rk : array-like
        Left and right key values. The int64 factorizer is used only when
        both are integer dtype.
    sort : boolean, default True
        If True, both label arrays are passed through ``_sort_labels``
        together with the factorizer's uniques.

    Returns
    -------
    llab, rlab, count : left labels, right labels, number of groups
    """
    both_integer = com.is_integer_dtype(lk) and com.is_integer_dtype(rk)
    if both_integer:
        factorizer_cls = lib.Int64Factorizer
        lk, rk = com._ensure_int64(lk), com._ensure_int64(rk)
    else:
        factorizer_cls = lib.Factorizer
        lk, rk = com._ensure_object(lk), com._ensure_object(rk)

    rizer = factorizer_cls(max(len(lk), len(rk)))

    # factorize returns a pair here; only the labels are kept
    llab, _left_extra = rizer.factorize(lk)
    rlab, _right_extra = rizer.factorize(rk)
    count = rizer.get_count()

    if sort:
        llab, rlab = _sort_labels(rizer.uniques, llab, rlab)

    # TODO: na handling

    return llab, rlab, count
def _get_data_algo(values, func_map):
    """
    Select the entry of ``func_map`` matching the dtype of ``values`` and
    coerce ``values`` to match. datetime64 values are viewed as int64 and
    use the int64 entry.

    Returns
    -------
    f, values
    """
    if com.is_float_dtype(values):
        return func_map["float64"], com._ensure_float64(values)

    if com.is_datetime64_dtype(values):
        return func_map["int64"], values.view("i8")

    if com.is_integer_dtype(values):
        return func_map["int64"], com._ensure_int64(values)

    return func_map["generic"], com._ensure_object(values)
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Tally the occurrences of each non-null value

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort the resulting Series
    ascending : boolean, default False
        Leave the sorted result ascending rather than reversing it
    normalize: boolean, default False
        If True, divide the counts by the number of input values

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)
    dtype = values.dtype

    if com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        values = values.view(np.int64)
        int_keys, counts = htable.value_count_int64(values)
        # convert the keys back to the dtype we came in
        keys = Series(int_keys, dtype=dtype)
    else:
        null_mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, null_mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        total = float(values.size)
        result = result / total

    return result
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``. Otherwise the
    i8 values are taken through ``_assert_take_fillable`` with
    ``tslib.iNaT`` as the NA value. ``axis`` is accepted but not used.
    """
    indices = com._ensure_int64(indices)

    as_slice = lib.maybe_indices_to_slice(indices, len(self))
    if isinstance(as_slice, slice):
        return self[as_slice]

    picked = self._assert_take_fillable(self.asi8, indices,
                                        allow_fill=allow_fill,
                                        fill_value=fill_value,
                                        na_value=tslib.iNaT)

    # keep freq in PeriodIndex, reset otherwise
    if isinstance(self, com.ABCPeriodIndex):
        freq = self.freq
    else:
        freq = None
    return self._shallow_copy(picked, freq=freq)
def _get_data_algo(values, func_map):
    """
    Select the entry of ``func_map`` matching the dtype of ``values`` and
    coerce ``values`` to match. Values for which
    ``com.needs_i8_conversion`` is true are viewed as int64 and use the
    int64 entry.

    Returns
    -------
    f, values
    """
    if com.is_float_dtype(values):
        return func_map['float64'], com._ensure_float64(values)

    if com.needs_i8_conversion(values):
        return func_map['int64'], values.view('i8')

    if com.is_integer_dtype(values):
        return func_map['int64'], com._ensure_int64(values)

    return func_map['generic'], com._ensure_object(values)
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
    """
    Analogous to ndarray.take

    Slice-like indices are answered with ``self[slice]``. Otherwise the
    i8 values are taken; when ``allow_fill`` is set and a fill value was
    given, positions indexed with -1 are set to ``tslib.iNaT``. The result
    is shallow-copied with ``freq=None``. ``axis`` is accepted but not
    used.
    """
    indices = com._ensure_int64(indices)

    as_slice = lib.maybe_indices_to_slice(indices, len(self))
    if isinstance(as_slice, slice):
        return self[as_slice]

    picked = self.asi8.take(com._ensure_platform_int(indices))

    # only fill if we are passing a non-None fill_value
    wants_fill = allow_fill and fill_value is not None
    if wants_fill:
        fill_positions = indices == -1
        if fill_positions.any():
            picked[fill_positions] = tslib.iNaT

    return self._shallow_copy(picked, freq=None)
def mode(self):
    """
    Returns the mode(s) of the Categorical.

    Codes equal to -1 are excluded before counting. Always returns
    `Categorical` even if only one value.

    Returns
    -------
    modes : `Categorical` (sorted)
    """
    import pandas.hashtable as htable

    valid_codes = self._codes[self._codes != -1]
    mode_codes = sorted(htable.mode_int64(com._ensure_int64(valid_codes)))

    return Categorical(mode_codes,
                       categories=self.categories,
                       ordered=self.ordered,
                       name=self.name,
                       fastpath=True)
def get_group_index(label_list, shape):
    """
    For the particular label_list, gets the offsets into the hypothetical
    list representing the totally ordered cartesian product of all
    possible label combinations.

    Parameters
    ----------
    label_list : sequence of label arrays, one per key
    shape : sequence of int
        Number of distinct labels per key, in the same order.

    Returns
    -------
    group_index : int64 ndarray, or ``label_list[0]`` unchanged when only
        one label array is passed. Positions where any key has a negative
        label are set to -1.
    """
    if len(label_list) == 1:
        return label_list[0]

    n = len(label_list[0])
    group_index = np.zeros(n, dtype=np.int64)
    mask = np.zeros(n, dtype=bool)

    # ``range`` instead of the Python-2-only ``xrange`` keeps this usable
    # on both Python 2 and Python 3
    for i in range(len(shape)):
        # stride of key i = product of the sizes of all later keys
        stride = np.prod(list(shape[i + 1:]), dtype=np.int64)
        labels = com._ensure_int64(label_list[i])
        group_index += labels * stride
        # compare the coerced labels so plain sequences work as well
        mask |= labels < 0

    np.putmask(group_index, mask, -1)
    return group_index
def value_counts(values, sort=True, ascending=False):
    """
    Tally the occurrences of each non-null value

    Parameters
    ----------
    values : ndarray (1-d)
        Converted with ``np.asarray`` first
    sort : boolean, default True
        Sort the resulting Series
    ascending : boolean, default False
        Leave the sorted result ascending rather than reversing it

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        keys, counts = lib.value_count_int64(com._ensure_int64(values))
        result = Series(counts, index=keys)
    else:
        # generic path: count the non-null values one at a time
        tally = defaultdict(int)
        for value in values[com.notnull(values)]:
            tally[value] += 1
        result = Series(tally)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
def _compress_group_index(group_index, sort=True):
    """
    Group_index is offsets into cartesian product of all possible labels.
    This space can be huge, so this function compresses it, by computing
    offsets (comp_ids) into the list of unique labels (obs_group_ids).

    Returns
    -------
    comp_ids, obs_group_ids
        With ``sort=True`` and at least one observed id, the observed ids
        are sorted and ``comp_ids`` are remapped accordingly.
    """
    observed = []
    table = lib.Int64HashTable(len(group_index))

    # note, group labels come out ascending (ie, 1,2,3 etc)
    comp_ids = table.get_labels_groupby(com._ensure_int64(group_index),
                                        observed)

    # these are the unique ones we observed, in the order we observed them
    obs_group_ids = np.array(observed, dtype='i8')

    if not sort or len(obs_group_ids) == 0:
        return comp_ids, obs_group_ids

    # order is index where elements ought to go
    order = obs_group_ids.argsort()
    n_observed = len(order)

    # inverse is where elements came from
    inverse = np.empty(n_observed, dtype='i4')
    inverse.put(order, np.arange(n_observed))

    # remember the negative ids so they can be restored to -1 afterwards
    negative = comp_ids < 0

    # move comp_ids to right locations (ie, unsort ascending labels)
    comp_ids = inverse.take(comp_ids)
    np.putmask(comp_ids, negative, -1)

    # sort observed ids
    return comp_ids, obs_group_ids.take(order)
def value_counts(values, sort=True, ascending=False):
    """
    Tally the occurrences of each non-null value

    Parameters
    ----------
    values : ndarray (1-d)
        Converted with ``np.asarray`` first
    sort : boolean, default True
        Sort the resulting Series
    ascending : boolean, default False
        Leave the sorted result ascending rather than reversing it

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if not com.is_integer_dtype(values.dtype):
        null_mask = com.isnull(values)
        as_objects = com._ensure_object(values)
        keys, counts = htable.value_count_object(as_objects, null_mask)
    else:
        as_int64 = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(as_int64)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
        Carries the ``name`` attribute of the input, when it has one.

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises TypeError on the values.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    # read the name before the input is unwrapped to its underlying values
    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the bin codes are counted instead of the raw values
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        # categorical values provide their own value_counts
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                # tz is only bound on this path; it is read again below
                # under the same is_datetimetz condition
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            # count on the int64 view of the values
            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            else:
                # convert the keys back to the dtype we came in
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            # value_count_object is given the null mask; when nulls must
            # be reported they are added back here as a leading NaN entry
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            # label each row with the left edge of its bin
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # values.size is the size after the coercions above
        result = result / float(values.size)

    return result