def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : unused (kept for backward compatibility)
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    labels : ndarray of platform int
        Codes into `uniques`; `na_sentinel` marks missing values
    uniques : ndarray (or PeriodIndex when `values` is a PeriodIndex)
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)

    # datetimes are hashed via an integer view; remember to restore dtype
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)
    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # was a bare `except:`; narrowed to TypeError, which is what
            # py3 raises for unorderable mixed str/int comparisons
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))
            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for i, e in enumerate(uniques)
                                  if f(e)], dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        # reverse_indexer[old label] -> label position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # keep NA labels at -1 after the remap
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    if isinstance(values, PeriodIndex):
        # uniques are period ordinals at this point; rebuild the index
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
def values_at_time(obj, time, tz=None, asof=False):
    """
    Select values at particular time of day (e.g. 9:30AM)

    Parameters
    ----------
    time : datetime.time or string
    tz : string or pytz.timezone
        Time zone for time. Corresponding timestamps would be converted to
        time zone of the TimeSeries

    Returns
    -------
    values_at_time : TimeSeries
    """
    from dateutil.parser import parse

    # only the plain, naive DatetimeIndex case is implemented
    if asof or tz:
        raise NotImplementedError
    if not isinstance(obj.index, DatetimeIndex):
        raise NotImplementedError

    if isinstance(time, basestring):
        time = parse(time).time()

    # TODO: time object with tzinfo?
    nanos = _time_to_nanosecond(time)
    locs = lib.values_at_time(obj.index.asi8, nanos)
    return obj.take(com._ensure_platform_int(locs))
def remove_unused_categories(self, inplace=False):
    """
    Removes categories which are not used.

    Parameters
    ----------
    inplace : boolean (default: False)
       Whether or not to drop unused categories inplace or return a copy
       of this categorical with unused categories dropped.

    Returns
    -------
    cat : Categorical with unused categories dropped or None if inplace.

    See also
    --------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    set_categories
    """
    target = self.copy() if not inplace else self

    # codes actually present in the data, in sorted order
    used_codes = sorted(np.unique(target._codes))
    kept = _ensure_index(
        target.categories.take(com._ensure_platform_int(used_codes)))

    # recompute codes against the reduced category set
    target._codes = _get_codes_for_values(target.__array__(), kept)
    target._categories = kept

    if not inplace:
        return target
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort : boolean, default False
        Sort the uniques (and remap labels/counts accordingly)
    order : unused
    na_sentinel : int, default -1
        Label assigned to values not found / NA

    Returns
    -------
    labels, uniques, counts
    """
    hash_klass, values = _get_data_algo(values, _hashtables)

    uniques = []
    table = hash_klass(len(values))
    labels, counts = table.get_labels(values, uniques, 0, na_sentinel)
    labels = com._ensure_platform_int(labels)

    uniques = com._asarray_tuplesafe(uniques)
    if sort and len(counts) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old label] -> label position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # keep NA labels at -1 after the remap
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)
        counts = counts.take(sorter)

    return labels, uniques, counts
def _make_selectors(self): new_levels = self.new_index_levels # make the mask group_index = get_group_index(self.sorted_labels[:-1], [len(x) for x in new_levels]) group_index = _ensure_platform_int(group_index) group_mask = np.zeros(self.full_shape[0], dtype=bool) group_mask.put(group_index, True) stride = self.index.levshape[self.level] selector = self.sorted_labels[-1] + stride * group_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) # compress labels unique_groups = np.arange(self.full_shape[0])[group_mask] compressor = group_index.searchsorted(unique_groups) if mask.sum() < len(self.index): raise ReshapeError("Index contains duplicate entries, " "cannot reshape") self.group_mask = group_mask self.group_index = group_index self.mask = mask self.unique_groups = unique_groups self.compressor = compressor
def _make_selectors(self): new_levels = self.new_index_levels # make the mask remaining_labels = self.sorted_labels[:-1] level_sizes = [len(x) for x in new_levels] comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) ngroups = len(obs_ids) comp_index = _ensure_platform_int(comp_index) stride = self.index.levshape[self.level] self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): raise ReshapeError('Index contains duplicate entries, ' 'cannot reshape') self.group_index = comp_index self.mask = mask self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups))
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
    """
    Analogous to ndarray.take
    """
    locs = com._ensure_platform_int(indices)
    # take on the underlying i8 values, then rewrap with the same metadata
    picked = self.asi8.take(locs, axis=axis)
    return self._simple_new(picked, self.name, freq=self.freq)
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
    # take on the codes, honoring the fill semantics, then rebuild the
    # result from the taken codes
    taken_codes = self._assert_take_fillable(
        self.codes, com._ensure_platform_int(indices),
        allow_fill=allow_fill, fill_value=fill_value, na_value=-1)
    return self._create_from_codes(taken_codes)
def factor_indexer(shape, labels):
    """
    given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer
    """
    # strides for row-major layout: cumulative products from the right
    strides = np.array(shape)[::-1].cumprod()[::-1]
    # dot each row of labels with (strides, 1) to get flat positions
    flat = np.sum(np.array(labels).T * np.append(strides, [1]), axis=1).T
    return com._ensure_platform_int(flat)
def ref_locs(self):
    # Lazily compute and cache the positions of this block's items
    # within ref_items.
    if self._ref_locs is None:
        indexer = self.ref_items.get_indexer(self.items)
        indexer = com._ensure_platform_int(indexer)
        # -1 means an item was not found in ref_items; raise explicitly
        # rather than via `assert`, which is stripped under python -O
        if (indexer == -1).any():
            raise AssertionError("Some block items were not in block "
                                 "ref_items")
        self._ref_locs = indexer
    return self._ref_locs
def ref_locs(self):
    # Lazily compute and cache the positions of this block's items
    # within ref_items.
    if self._ref_locs is None:
        indexer = self.ref_items.get_indexer(self.items)
        indexer = com._ensure_platform_int(indexer)
        # -1 means an item was not found in ref_items; raise explicitly
        # rather than via `assert`, which is stripped under python -O
        if (indexer == -1).any():
            raise AssertionError("Some block items were not in block "
                                 "ref_items")
        self._ref_locs = indexer
    return self._ref_locs
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None):
    """
    Conform the sparse frame to the new `index`, inserting `fill_value`
    where rows have no match.
    """
    if level is not None:
        # TypeError (a subclass of Exception, so existing callers that
        # catch Exception still work) is the conventional error for an
        # unsupported argument; raising bare Exception hides intent
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        if copy:
            return self.copy()
        else:
            return self

    if len(self.index) == 0:
        # nothing to align against; return an empty frame on the new index
        return SparseDataFrame(index=index, columns=self.columns)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = com._ensure_platform_int(indexer)
    mask = indexer == -1
    need_mask = mask.any()

    new_series = {}
    for col, series in self.iteritems():
        values = series.values
        new = values.take(indexer)
        if need_mask:
            # patch fill_value into positions with no match (-1 indexer)
            np.putmask(new, mask, fill_value)
        new_series[col] = new

    return SparseDataFrame(new_series, index=index, columns=self.columns,
                           default_fill_value=self.default_fill_value)
def take(self, indices, axis=0, convert=True):
    """
    Analogous to ndarray.take, return SparseDataFrame corresponding to
    requested indices along an axis

    Parameters
    ----------
    indices : list / array of ints
    axis : {0, 1}
    convert : convert indices for negative values, check bounds, default True
       mainly useful for an user routine calling

    Returns
    -------
    taken : SparseDataFrame
    """
    locs = com._ensure_platform_int(indices)

    # check/convert indices here (negative values, bounds)
    if convert:
        locs = _maybe_convert_indices(locs, len(self._get_axis(axis)))

    taken_values = self.values.take(locs, axis=axis)
    if axis == 0:
        # rows taken: index shrinks, columns unchanged
        new_index, new_columns = self.index.take(locs), self.columns
    else:
        # columns taken: columns shrink, index unchanged
        new_index, new_columns = self.index, self.columns.take(locs)
    return self._constructor(taken_values,
                             index=new_index, columns=new_columns)
def take(self, indices, axis=None):
    """
    Analogous to ndarray.take
    """
    locs = com._ensure_platform_int(indices)
    # take over the raw values, then rebuild with identical name/freq
    return self._simple_new(self.values.take(locs, axis=axis),
                            self.name, freq=self.freq)
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take, return SparseDataFrame corresponding to
    requested indices along an axis

    Parameters
    ----------
    indices : list / array of ints
    axis : {0, 1}

    Returns
    -------
    taken : SparseDataFrame
    """
    locs = com._ensure_platform_int(indices)
    taken_values = self.values.take(locs, axis=axis)
    if axis == 0:
        # rows taken: index shrinks, columns unchanged
        return self._constructor(taken_values,
                                 index=self.index.take(locs),
                                 columns=self.columns)
    # columns taken: columns shrink, index unchanged
    return self._constructor(taken_values, index=self.index,
                             columns=self.columns.take(locs))
def _make_selectors(self): new_levels = self.new_index_levels # make the mask group_index = get_group_index(self.sorted_labels[:-1], [len(x) for x in new_levels]) group_index = _ensure_platform_int(group_index) group_mask = np.zeros(self.full_shape[0], dtype=bool) group_mask.put(group_index, True) stride = self.index.levshape[self.level] selector = self.sorted_labels[-1] + stride * group_index mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) # compress labels unique_groups = np.arange(self.full_shape[0])[group_mask] compressor = group_index.searchsorted(unique_groups) if mask.sum() < len(self.index): raise ReshapeError('Index contains duplicate entries, ' 'cannot reshape') self.group_mask = group_mask self.group_index = group_index self.mask = mask self.unique_groups = unique_groups self.compressor = compressor
def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
                   limit=None, takeable=False):
    # Conform the sparse frame to the new index, inserting fill_value
    # where rows have no match.
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if self.index.equals(index):
        if copy:
            return self.copy()
        else:
            return self

    if len(self.index) == 0:
        # nothing to align against; return an empty frame on the new index
        return SparseDataFrame(index=index, columns=self.columns)

    indexer = self.index.get_indexer(index, method, limit=limit)
    indexer = com._ensure_platform_int(indexer)
    mask = indexer == -1
    need_mask = mask.any()

    new_series = {}
    for col, series in compat.iteritems(self):
        values = series.values
        new = values.take(indexer)
        if need_mask:
            # patch fill_value into positions with no match (-1 indexer)
            np.putmask(new, mask, fill_value)
        new_series[col] = new

    return SparseDataFrame(new_series, index=index, columns=self.columns,
                           default_fill_value=self.default_fill_value)
def ref_locs(self):
    # lazily computed, cached positions of our items inside ref_items
    if self._ref_locs is None:
        locs = com._ensure_platform_int(
            self.ref_items.get_indexer(self.items))
        # -1 marks an item that was not found in ref_items
        if (locs == -1).any():
            raise AssertionError("Some block items were not in block "
                                 "ref_items")
        self._ref_locs = locs
    return self._ref_locs
def _indices_at_time(self, key):
    """
    Return the positional indexer for entries matching the given
    time of day.

    Parameters
    ----------
    key : datetime.time (presumably; it is passed straight to
        _time_to_nanosecond -- confirm against callers)

    Returns
    -------
    indexer : ndarray of platform int
    """
    # removed `from dateutil.parser import parse`: it was never used here
    # TODO: time object with tzinfo?
    nanos = _time_to_nanosecond(key)
    indexer = lib.values_at_time(self.asi8, nanos)
    return com._ensure_platform_int(indexer)
def _sort_labels(uniques, left, right):
    # Remap the `left`/`right` label arrays so they refer to the sorted
    # order of `uniques`; -1 (NA) labels are preserved as -1.
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    sorter = uniques.argsort()

    # reverse_indexer[old label] -> label position after sorting
    reverse_indexer = np.empty(len(sorter), dtype=np.int64)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    new_left = reverse_indexer.take(com._ensure_platform_int(left))
    np.putmask(new_left, left == -1, -1)

    new_right = reverse_indexer.take(com._ensure_platform_int(right))
    np.putmask(new_right, right == -1, -1)

    return new_left, new_right
def _get_codes_for_values(values, levels):
    # Turn values into level-codes: hash the levels, then look each value
    # up in the table (lookup semantics, e.g. the code for values absent
    # from levels, come from the hashtable implementation).
    from pandas.core.algorithms import _get_data_algo, _hashtables

    if values.dtype != levels.dtype:
        # mixed dtypes: fall back to object so hashing is consistent
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)

    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(levels)
    return com._ensure_platform_int(t.lookup(values))
def ref_locs(self):
    # lazily computed, cached positions of our items inside ref_items
    if self._ref_locs is None:
        locs = com._ensure_platform_int(
            self.ref_items.get_indexer(self.items))
        # -1 marks an item that was not found in ref_items
        if (locs == -1).any():
            raise AssertionError('Some block items were not in block '
                                 'ref_items')
        self._ref_locs = locs
    return self._ref_locs
def _get_group_levels(self, mask, obs_ids):
    # Expand the compressed group ids back into per-grouping label
    # arrays and pair each with its grouping's name.
    recons_labels = decons_group_index(obs_ids, self._group_shape)
    return [
        (ping.name,
         ping.group_index.take(com._ensure_platform_int(labels)))
        for ping, labels in zip(self.groupings, recons_labels)
    ]
def take(self, indices, axis=0, allow_fill=True, fill_value=None,
         **kwargs):
    """
    Sparse-compatible version of ndarray.take

    Returns
    -------
    taken : ndarray
    """
    nv.validate_take(tuple(), kwargs)

    if axis:
        raise ValueError("axis must be 0, input was {0}".format(axis))

    if com.is_integer(indices):
        # return scalar
        return self[indices]

    indices = com._ensure_platform_int(indices)
    n = len(self)
    if allow_fill and fill_value is not None:
        # allow -1 to indicate self.fill_value,
        # self.fill_value may not be NaN
        if (indices < -1).any():
            msg = ('When allow_fill=True and fill_value is not None, '
                   'all indices must be >= -1')
            raise ValueError(msg)
        elif (n <= indices).any():
            msg = 'index is out of bounds for size {0}'
            raise IndexError(msg.format(n))
    else:
        if ((indices < -n) | (n <= indices)).any():
            msg = 'index is out of bounds for size {0}'
            raise IndexError(msg.format(n))

    indices = indices.astype(np.int32)
    if not (allow_fill and fill_value is not None):
        # plain numpy semantics: wrap negative indices around
        indices = indices.copy()
        indices[indices < 0] += n

    # map requested positions into sp_values (-1 -> position is a gap)
    locs = self.sp_index.lookup_array(indices)
    indexer = np.arange(len(locs), dtype=np.int32)
    mask = locs != -1
    if mask.any():
        indexer = indexer[mask]
        new_values = self.sp_values.take(locs[mask])
    else:
        # no requested position hits a stored value
        indexer = np.empty(shape=(0, ), dtype=np.int32)
        new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

    sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
    return self._simple_new(new_values, sp_index, self.fill_value)
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take
    """
    # a contiguous run of indices can be served by plain slicing
    maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]

    locs = com._ensure_platform_int(indices)
    return DatetimeIndex(self.values.take(locs, axis=axis),
                         tz=self.tz, name=self.name)
def remove_unused_levels(self):
    """
    Removes levels which are not used. The level removal is done inplace.
    """
    # levels whose codes actually occur, in sorted order
    used = sorted(np.unique(self._codes))
    kept = _ensure_index(self.levels.take(com._ensure_platform_int(used)))
    # recompute codes against the reduced level set
    self._codes = _get_codes_for_values(self.__array__(), kept)
    self._levels = kept
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take
    """
    # a contiguous run of indices collapses to a cheaper slice
    maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]

    locs = com._ensure_platform_int(indices)
    return self._simple_new(self.values.take(locs, axis=axis),
                            self.name, None, self.tz)
def test_ensure_platform_int():
    # verify that when we create certain types of indices
    # they remain the correct type under platform conversions
    from pandas.core.index import Int64Index

    for dtype, expected in [('int64', np.int64), ('int32', np.int32)]:
        x = Int64Index([1, 2, 3], dtype=dtype)
        assert (x.dtype == expected)
        # regardless of storage width, conversion lands on platform int
        pi = com._ensure_platform_int(x)
        assert (pi.dtype == np.int_)
def take(self, indices, axis=0, **kwargs):
    """
    Analogous to ndarray.take
    """
    locs = com._ensure_int64(indices)
    # contiguous requests can be answered with a cheap slice instead
    maybe_slice = lib.maybe_indices_to_slice(locs, len(self))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]

    taken = self.asi8.take(com._ensure_platform_int(locs))
    # an arbitrary take can break regularity, so the result has no freq
    return self._shallow_copy(taken, freq=None)
def get_group_levels(self):
    obs_ids = self.group_info[1]
    # expand the compressed group ids back into per-grouping label arrays
    recons_labels = decons_group_index(obs_ids, self.shape)
    return [
        ping.group_index.take(com._ensure_platform_int(labels))
        for ping, labels in zip(self.groupings, recons_labels)
    ]
def take(self, indices, axis=None):
    """
    Analogous to ndarray.take
    """
    locs = com._ensure_platform_int(indices)
    # view the taken values as a PeriodIndex and restore the metadata
    result = self.values.take(locs, axis=axis).view(PeriodIndex)
    result.freq = self.freq
    result.name = self.name
    return result
def take(self, indices, axis=0):
    """
    Analogous to ndarray.take
    """
    locs = com._ensure_int64(indices)
    # a contiguous run of indices is served by slicing instead of copying
    maybe_slice = lib.maybe_indices_to_slice(locs, len(self))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]

    taken = self.asi8.take(com._ensure_platform_int(locs))
    # an arbitrary take can break regularity, so the result has no freq
    return self._shallow_copy(taken, freq=None)
def test_ensure_platform_int():
    # verify that when we create certain types of indices
    # they remain the correct type under platform conversions
    from pandas.core.index import Int64Index

    for dtype in ('int64', 'int32'):
        idx = Int64Index([1, 2, 3], dtype=dtype)
        assert(idx.dtype == np.dtype(dtype))
        # conversion should always yield the platform int
        assert(com._ensure_platform_int(idx).dtype == np.int_)
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : array-like
    sort : boolean, default False
        Sort the uniques and remap labels accordingly
        (docstring previously said default True; the signature says False)
    order : optional, unused
    na_sentinel : int, default -1
        Value to mark "not found"

    Examples
    --------
    >>> factorize([12,3,8,5,9,7,11],sort=True,order=None,na_sentinel=-1)
    (array([6, 0, 3, 1, 4, 2, 5]),
     array([ 3,  5,  7,  8,  9, 11, 12], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6]),
     array([12,  3,  8,  5,  9,  7, 10], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6, 6]),
     array([12,  3,  8,  5,  9,  7, 10], dtype=int64))

    Returns
    -------
    labels : ndarray
        Code of each element of `values` (na_sentinel for NA)
    uniques : ndarray
        The distinct values, without duplication
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)

    # datetimes are hashed via an integer view; remember to restore dtype
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old label] -> label position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        # keep NA labels at -1 after the remap
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.view('M8[ns]')
    if isinstance(values, PeriodIndex):
        # uniques are period ordinals at this point; rebuild the index
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
def describe(self): """ Describes this Categorical Returns ------- description: `DataFrame` A dataframe with frequency and counts by level. """ # Hack? from pandas.core.frame import DataFrame counts = DataFrame({ 'codes': self._codes, 'values': self._codes }).groupby('codes').count() freqs = counts / float(counts.sum()) from pandas.tools.merge import concat result = concat([counts, freqs], axis=1) result.columns = ['counts', 'freqs'] # fill in the real levels check = result.index == -1 if check.any(): # Sort -1 (=NaN) to the last position index = np.arange(0, len(self.levels) + 1, dtype='int64') index[-1] = -1 result = result.reindex(index) # build new index levels = np.arange(0, len(self.levels) + 1, dtype=object) levels[:-1] = self.levels levels[-1] = np.nan result.index = levels.take(com._ensure_platform_int(result.index)) else: result.index = self.levels.take( com._ensure_platform_int(result.index)) result = result.reindex(self.levels) result.index.name = 'levels' return result
def _get_codes_for_values(values, levels):
    """ utility routine to turn values into codes given the specified
    levels """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    if values.dtype != levels.dtype:
        # mixed dtypes: fall back to object so hashing is consistent
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)

    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(levels))
    t.map_locations(com._values_from_object(levels))
    return com._ensure_platform_int(t.lookup(values))
def describe(self): """ Describes this Categorical Returns ------- description: `DataFrame` A dataframe with frequency and counts by category. """ # Hack? from pandas.core.frame import DataFrame counts = DataFrame({ 'codes' : self._codes, 'values' : self._codes } ).groupby('codes').count() freqs = counts / float(counts.sum()) from pandas.tools.merge import concat result = concat([counts,freqs],axis=1) result.columns = ['counts','freqs'] # fill in the real categories check = result.index == -1 if check.any(): # Sort -1 (=NaN) to the last position index = np.arange(0, len(self.categories)+1, dtype='int64') index[-1] = -1 result = result.reindex(index) # build new index categories = np.arange(0,len(self.categories)+1 ,dtype=object) categories[:-1] = self.categories categories[-1] = np.nan result.index = categories.take(com._ensure_platform_int(result.index)) else: result.index = self.categories.take(com._ensure_platform_int(result.index)) result = result.reindex(self.categories) result.index.name = 'categories' return result
def _get_codes_for_values(values, categories):
    """ utility routine to turn values into codes given the specified
    categories """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    if values.dtype != categories.dtype:
        # mixed dtypes: fall back to object so hashing is consistent
        values = com._ensure_object(values)
        categories = com._ensure_object(categories)

    (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
    t = hash_klass(len(categories))
    t.map_locations(com._values_from_object(categories))
    return com._ensure_platform_int(t.lookup(values))
def get_group_levels(self):
    # Rebuild the per-grouping label arrays from the observed group ids.
    obs_ids = self.group_info[1]

    if self._overflow_possible:
        # group ids are stored as tuples of labels; transpose them back
        # into one array per grouping
        recons_labels = [np.array(x) for x in izip(*obs_ids)]
    else:
        recons_labels = decons_group_index(obs_ids, self.shape)

    name_list = []
    for ping, labels in zip(self.groupings, recons_labels):
        labels = com._ensure_platform_int(labels)
        name_list.append(ping.group_index.take(labels))

    return name_list
def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info):
    # Fallback take: write arr.take(indexer) into the preallocated `out`
    # along `axis`, then overwrite positions where indexer == -1 with
    # fill_value.
    if mask_info is not None:
        # caller precomputed (mask, needs_masking)
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(com._ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        # select the masked positions along `axis` only
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value
def take(self, indexer, axis=0, allow_fill=True, fill_value=None):
    """
    For internal compatibility with numpy arrays.

    Notes
    -----
    Filling must always be None/nan here, but fill_value is passed
    through internally (per the original note: ``assert
    isnull(fill_value)`` -- not enforced in this body; confirm callers).

    See also
    --------
    numpy.ndarray.take
    """
    indexer = com._ensure_platform_int(indexer)
    taken = self.codes.take(indexer)
    return self._create_from_codes(taken)
def _make_sorted_values_labels(self):
    # Sort values/labels so the level being unstacked varies fastest.
    v = self.level

    labs = list(self.index.labels)
    levs = list(self.index.levels)
    # move the unstacked level's labels/levels to the end
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    ngroups = len(obs_ids)

    # stable group-sort of rows by the compressed group id
    indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = com.take_nd(self.values, indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
    """
    Analogous to ndarray.take
    """
    indices = com._ensure_int64(indices)
    # a contiguous run of indices can be served by a plain slice
    maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
    if isinstance(maybe_slice, slice):
        return self[maybe_slice]
    taken = self.asi8.take(com._ensure_platform_int(indices))

    # only fill if we are passing a non-None fill_value
    if allow_fill and fill_value is not None:
        mask = indices == -1
        if mask.any():
            # mark filled positions as NaT in the i8 representation
            taken[mask] = tslib.iNaT

    return self._shallow_copy(taken, freq=None)