def test_datetime64_dtype_array_returned(self):
    # GH 9431
    expected = np.array(
        ["2015-01-03T00:00:00.000000000+0000",
         "2015-01-01T00:00:00.000000000+0000"],
        dtype="M8[ns]")

    dt_index = pd.to_datetime(
        ["2015-01-03T00:00:00.000000000+0000",
         "2015-01-01T00:00:00.000000000+0000",
         "2015-01-01T00:00:00.000000000+0000"])
    result = algos.unique(dt_index)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    s = pd.Series(dt_index)
    result = algos.unique(s)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    arr = s.values
    result = algos.unique(arr)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)
def _get_wom_rule(self):
    wdiffs = unique(np.diff(self.index.week))
    if not lib.ismember(wdiffs, set([4, 5])).all():
        return None

    weekdays = unique(self.index.weekday)
    if len(weekdays) > 1:
        return None

    # get which week
    week = (self.index[0].day - 1) // 7 + 1
    wd = _weekday_rule_aliases[weekdays[0]]

    return "WOM-%d%s" % (week, wd)
def _maybe_cache(arg, format, cache, tz, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    tz : string
        Timezone of the dates
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        if not Index(arg).is_unique:
            unique_dates = algorithms.unique(arg)
            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array
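# Illustrative sketch only (not from the original source): the caching idea
# above converts each distinct date once and reuses the result for repeats.
# `parse_dates` below is a hypothetical stand-in for `convert_listlike`.
import pandas as pd

def parse_dates(values):
    # stand-in conversion function; the real one also takes format/tz
    return pd.to_datetime(values)

arg = ["2015-01-01", "2015-01-03", "2015-01-01", "2015-01-03"]
unique_dates = pd.unique(pd.Index(arg))
cache_array = pd.Series(parse_dates(unique_dates), index=unique_dates)
# repeated inputs become a cheap lookup instead of a re-parse
converted = pd.Series(arg).map(cache_array)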
def _infer_precision(base_precision, bins):
    """Infer an appropriate precision for _round_frac
    """
    for precision in range(base_precision, 20):
        levels = [_round_frac(b, precision) for b in bins]
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default
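# Minimal sketch of the inference loop above, assuming numpy only and using
# np.round in place of the internal _round_frac helper: precision is raised
# until rounding no longer collapses distinct bin edges into duplicates.
import numpy as np

bins = np.array([0.0001, 0.00015, 0.0002])
for precision in range(3, 20):
    levels = np.round(bins, precision)
    if np.unique(levels).size == bins.size:
        break
print(precision)  # first precision at which all rounded edges stay distinct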
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    x_is_series = isinstance(x, Series)
    series_index = None

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = "left" if right else "right"
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than "
                                 "the number of bin edges")
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        fac = Series(fac, index=series_index, name=name)

    if not retbins:
        return fac

    return fac, bins
def _get_annual_rule(self):
    if len(self.ydiffs) > 1:
        return None

    if len(algos.unique(self.fields["M"])) > 1:
        return None

    pos_check = self.month_position_check()
    return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check)
def test_timedelta64_dtype_array_returned(self):
    # GH 9431
    expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

    td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
    result = algos.unique(td_index)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    s = pd.Series(td_index)
    result = algos.unique(s)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)

    arr = s.values
    result = algos.unique(arr)
    tm.assert_numpy_array_equal(result, expected)
    self.assertEqual(result.dtype, expected.dtype)
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = _ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        # Numpy 1.9 support: ensure this mask is a Numpy array
        ids[np.asarray(x == bins[0])] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')

        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def _get_annual_rule(self):
    if len(self.ydiffs) > 1:
        return None

    if len(algos.unique(self.fields['M'])) > 1:
        return None

    pos_check = self.month_position_check()
    return {'cs': 'AS', 'bs': 'BAS', 'ce': 'A', 'be': 'BA'}.get(pos_check)
def _get_wom_rule(self):
    # wdiffs = unique(np.diff(self.index.week))
    # We also need -47, -49, -48 to catch index spanning year boundary
    # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
    #     return None

    weekdays = unique(self.index.weekday)
    if len(weekdays) > 1:
        return None

    week_of_months = unique((self.index.day - 1) // 7)
    if len(week_of_months) > 1:
        return None

    # get which week
    week = week_of_months[0] + 1
    wd = _weekday_rule_aliases[weekdays[0]]

    return 'WOM-%d%s' % (week, wd)
def _get_wom_rule(self):
    # wdiffs = unique(np.diff(self.index.week))
    # We also need -47, -49, -48 to catch index spanning year boundary
    # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
    #     return None

    weekdays = unique(self.index.weekday)
    if len(weekdays) > 1:
        return None

    week_of_months = unique((self.index.day - 1) // 7)
    # Only attempt to infer up to WOM-4. See #9425
    week_of_months = week_of_months[week_of_months < 4]
    if len(week_of_months) == 0 or len(week_of_months) > 1:
        return None

    # get which week
    week = week_of_months[0] + 1
    wd = _weekday_rule_aliases[weekdays[0]]

    return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
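# Small check of the week-of-month arithmetic above (assumption: a standard
# pandas installation). For a genuine week-of-month frequency every date falls
# in the same (day - 1) // 7 slot, so only one value survives the unique step.
import pandas as pd

idx = pd.date_range("2015-01-01", periods=6, freq="WOM-3FRI")
print(set((idx.day - 1) // 7 + 1))  # {3}, matching the inferred "WOM-3FRI"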
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None):

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
def _read_table_native(self, group, where, name=None):
    if name is None:
        name = getattr(group._v_attrs, 'tables')[0]
    table = getattr(group, name)
    # if not found then some sort of default behaviour maybe?

    info = table.attrs._pandas_info

    # no selection implemented
    # need to deal with tz info etc.

    indices = []
    index_names = []
    column_names = []
    data = []
    for i, col in enumerate(table.colnames):
        if info[col].get('isIndex', False):
            indices.append(_maybe_convert(table.read(field=col),
                                          info[col]['kind']))
            index_names.append(info[col]['name_data'])
        else:
            data.append(_maybe_convert(table.read(field=col),
                                       info[col]['kind']))
            column_names.append(info[col]['name_data'])

    index = MultiIndex.from_arrays(indices, names=index_names)

    kind = info['pandas_type']
    if kind == 'series' and len(column_names) == 1:
        return Series(data=data[0], index=index, name=column_names[0])
    elif kind == 'frame':
        if len(unique(column_names)) == len(column_names):
            return DataFrame(dict(zip(column_names, data)), index=index)
        else:
            raise NotImplementedError("No support for duplicate column names")
    else:
        raise NotImplementedError("Only series and frame are supported "
                                  "at this time")
def _read_panel_table(self, group, where=None):
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items, major.levels,
                                     minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        indexer = match(unique_tuples, tuple_index)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')

        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def _bins_to_cuts(
    x,
    bins,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
):
    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )
        elif labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest,
                dtype=dtype
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
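# Usage sketch of the duplicate-edge handling implemented above, via the
# public pd.cut API (assumption: a recent pandas where the `duplicates`
# keyword exists). With duplicates="raise" (the default) the repeated edge 2
# would error; "drop" collapses it and bins against the unique edges instead.
import pandas as pd

print(pd.cut([1, 2, 3], bins=[0, 2, 2, 4], duplicates="drop"))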
def test_object_refcount_bug(self):
    lst = ["A", "B", "C", "D", "E"]
    for i in range(1000):
        len(algos.unique(lst))
def test_object_refcount_bug(self):
    lst = ['A', 'B', 'C', 'D', 'E']
    for i in range(1000):
        len(algos.unique(lst))
def unique(
    self: NDArrayBackedExtensionArrayT
) -> NDArrayBackedExtensionArrayT:
    new_data = unique(self._ndarray)
    return self._from_backing_data(new_data)
def precalculate_probability_table_dynamic_programming(
        D: pd.DataFrame, G: nx.DiGraph, ci: Hashable) -> np.ndarray:
    n = D.shape[0]
    H = np.zeros((n, n))
    P = [p for p in G.predecessors(ci)]
    C = [c for c in G.successors(ci)]
    S = [None] * len(C)
    J_P = 1
    S_c = [None] * len(C)
    J_C = [1] * len(C)
    J_S = [1] * len(C)
    for p in P:
        J_P *= len(np.unique(D[p]))
    for i, c in enumerate(C):
        S[i] = [s for s in G.predecessors(c)]
        S[i].remove(ci)
        S_c[i] = [s for s in S[i]]
        S_c[i].append(c)
        J_C[i] = len(np.unique(D[c]))
        for spouse in S[i]:
            J_S[i] *= len(np.unique(D[spouse]))

    for v in range(n):
        for u in range(v + 1):
            H[u, v] = math.log(sc.special.comb(v - u + J_P, J_P - 1))

    # Parent table
    for p in P:
        p_dist = pd.get_dummies(D[p]).to_numpy()
        dist_table = np.zeros((n, n, len(p_dist[0, :])), dtype=int)
        for v in range(n):
            for u in range(v + 1):
                # fill dist_table
                if v == u:
                    dist_table[u, v] = p_dist[v, :]
                else:
                    dist_table[u, v] = dist_table[u, v - 1] + p_dist[v, :]
                # calculate probability
                h = math.log(math.factorial(v + 1 - u))
                # h -= sum(np.log(sc.special.factorial(dist_table[u, v])))
                h -= sum(sc.special.gammaln(dist_table[u, v] + 1))
                H[u, v] += h

    # Child-Spouse table
    for i, c in enumerate(C):
        c_dist = pd.get_dummies(D[c]).to_numpy()
        n_c = len(c_dist[0, :])
        s_class: pd.Series
        if len(S[i]) > 0:
            s_class = D[S[i]].groupby(S[i]).ngroup()
        else:
            s_class = pd.Series(np.zeros(n))
        n_s_class = len(unique(s_class))
        dist_table = np.zeros((n, n, n_s_class, n_c), dtype=int)
        for v in range(n):
            for u in range(v + 1):
                h = 0
                for i_s_class in range(n_s_class):
                    z = np.zeros(n_c)
                    if i_s_class == s_class[v]:
                        z = c_dist[v, :]
                    # fill dist_table
                    if v == u:
                        dist_table[u, v, i_s_class] = z
                    else:
                        dist_table[u, v, i_s_class] = \
                            dist_table[u, v - 1, i_s_class] + z
                    # calculate probability
                    c_over_s_dist = dist_table[u, v, i_s_class]
                    n_c_over_s = sum(c_over_s_dist)
                    # h += math.log(sc.special.comb(n_c_over_s + J_C[i] - 1, J_C[i] - 1))
                    # h += math.log(math.factorial(n_c_over_s))
                    # h -= sum(np.log(sc.special.factorial(c_over_s_dist)))
                    # Vectors for faster gammaln calculation
                    add = np.asarray([n_c_over_s + J_C[i], n_c_over_s + 1])
                    sub = np.append([J_C[i], n_c_over_s + 1], c_over_s_dist + 1)
                    h += sum(sc.special.gammaln(add))
                    h -= sum(sc.special.gammaln(sub))
                H[u, v] += h

    return H
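# Side note on the gammaln substitution used above (assumption: scipy is
# available, as in the surrounding code): log(n!) == gammaln(n + 1), which is
# why the commented-out factorial/log terms can be replaced by gammaln calls.
import math
from scipy.special import gammaln

n = 5
assert math.isclose(math.log(math.factorial(n)), gammaln(n + 1))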
def precalculate_probability_table_split_up_numpy(D: pd.DataFrame,
                                                  G: nx.DiGraph,
                                                  ci: Hashable) -> np.ndarray:
    n = D.shape[0]
    H = np.zeros((n, n))
    P = [p for p in G.predecessors(ci)]
    C = [c for c in G.successors(ci)]
    S = [None] * len(C)
    J_P = 1
    S_c = [None] * len(C)
    J_C = [1] * len(C)
    J_S = [1] * len(C)
    for p in P:
        J_P *= len(np.unique(D[p]))
    for i, c in enumerate(C):
        S[i] = [s for s in G.predecessors(c)]
        S[i].remove(ci)
        S_c[i] = [s for s in S[i]]
        S_c[i].append(c)
        J_C[i] = len(np.unique(D[c]))
        for spouse in S[i]:
            J_S[i] *= len(np.unique(D[spouse]))

    vSu = np.zeros((n, n))
    for v in range(n):
        for u in range(v + 1):
            vSu[u, v] = v - u

    H = sc.special.gammaln(vSu + J_P + 1)
    H -= sc.special.gammaln(vSu + 2) + math.log(math.factorial(J_P - 1))

    # Parent table
    for p in P:
        p_dist = pd.get_dummies(D[p]).to_numpy()
        J_p = p_dist.shape[1]
        dist_table = np.reshape(np.tile(p_dist, (n, 1)), (n, n, J_p))
        tril_index = np.tril_indices(n, k=-1)
        dist_table[tril_index] = np.zeros(J_p)
        dist_table = np.cumsum(dist_table, axis=1)
        H += sc.special.gammaln(vSu + 2)
        H -= np.sum(sc.special.gammaln(dist_table + 1), axis=-1)

    # Child-Spouse table
    for i, c in enumerate(C):
        c_dist = pd.get_dummies(D[c]).to_numpy()
        n_c = len(c_dist[0, :])
        s_class: pd.Series
        if len(S[i]) > 0:
            s_class = D[S[i]].groupby(S[i]).ngroup()
        else:
            s_class = pd.Series(np.zeros(n))
        n_s_class = len(unique(s_class))
        dist_table = np.zeros((n, n_s_class, n_c), dtype=int)
        for v in range(n):
            for i_s_class in range(n_s_class):
                z = np.zeros(n_c)
                if i_s_class == s_class[v]:
                    z = c_dist[v, :]
                dist_table[v, i_s_class] = z
        intval_table = np.reshape(np.tile(dist_table, (n, 1, 1)),
                                  (n, n, n_s_class, n_c))
        tril_index = np.tril_indices(n, k=-1)
        intval_table[tril_index] = np.zeros((n_s_class, n_c))
        intval_table = np.cumsum(intval_table, axis=1)
        n_c_over_s_table = np.sum(intval_table, axis=-1)
        H += np.sum(sc.special.gammaln(n_c_over_s_table + J_C[i]), axis=-1)
        H -= math.log(math.factorial(J_C[i] - 1)) * n_s_class
        H -= np.sum(sc.special.gammaln(intval_table + 1), axis=(-1, -2))

    H = np.triu(H)
    return H
def remove_unused_levels(self):
    """
    create a new MultiIndex from the current that removes
    unused levels, meaning that they are not expressed in the labels

    The resulting MultiIndex will have the same outward
    appearance, meaning the same .values and ordering. It will also
    be .equals() to the original.

    .. versionadded:: 0.20.0

    Returns
    -------
    MultiIndex

    Examples
    --------
    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

    >>> i[2:]
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               codes=[[1, 1], [0, 1]])

    The 0 from the first level is not represented
    and can be removed

    >>> i[2:].remove_unused_levels()
    MultiIndex(levels=[[1], ['a', 'b']],
               codes=[[0, 0], [0, 1]])
    """
    import pandas.core.algorithms as algos

    new_levels = []
    new_labels = []
    changed = False
    for lev, lab in zip(self.levels, self.labels):

        # Since few levels are typically unused, bincount() is more
        # efficient than unique() - however it only accepts positive values
        # (and drops order):
        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
        has_na = int(len(uniques) and (uniques[0] == -1))

        if len(uniques) != len(lev) + has_na:
            # We have unused levels
            changed = True

            # Recalculate uniques, now preserving order.
            # Can easily be cythonized by exploiting the already existing
            # "uniques" and stop parsing "lab" when all items are found:
            uniques = algos.unique(lab)
            if has_na:
                na_idx = np.where(uniques == -1)[0]
                # Just ensure that -1 is in first position:
                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]

            # labels get mapped from uniques to 0:len(uniques)
            # -1 (if present) is mapped to last position
            label_mapping = np.zeros(len(lev) + has_na)
            # ... and reassigned value -1:
            label_mapping[uniques] = np.arange(len(uniques)) - has_na

            lab = label_mapping[lab]

            # new levels are simple
            lev = lev.take(uniques[has_na:])

        new_levels.append(lev)
        new_labels.append(lab)

    result = self._shallow_copy()

    if changed:
        result._reset_identity()
        result._set_levels(new_levels, validate=False)
        result._set_labels(new_labels, validate=False)

    return result
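# Sketch of the bincount trick used above (numpy only): shifting the codes by
# +1 lets the sentinel -1 (missing) be counted at position 0, so unused level
# positions show up as zero counts.
import numpy as np

lab = np.array([1, 1, -1, 1])
uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
print(uniques)  # [-1  1] -> level 0 is unused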
def test_objects(self):
    arr = np.random.randint(0, 100, size=50).astype('O')
    result = algos.unique(arr)
    self.assertTrue(isinstance(result, np.ndarray))
def unique(self):
    return type(self)(unique(self._ndarray))
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None, include_lowest=False):
    x_is_series = isinstance(x, Series)
    series_index = None

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, name=name,
                          fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        fac = Series(fac, index=series_index)

    if not retbins:
        return fac

    return fac, bins
def _read_panel_table(self, group, where=None):
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    major = Factor.from_array(index)
    minor = Factor.from_array(columns)

    J, K = len(major.levels), len(minor.levels)
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
        sorter = com._ensure_platform_int(sorter)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.ref_items, major.levels,
                                     minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index._tuple_index
        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        indexer = match(unique_tuples, tuple_index)
        indexer = com._ensure_platform_int(indexer)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def unique(self) -> "PandasArray":
    return type(self)(unique(self._ndarray))
def remove_unused_levels(self):
    """
    create a new MultiIndex from the current that removes
    unused levels, meaning that they are not expressed in the labels

    The resulting MultiIndex will have the same outward
    appearance, meaning the same .values and ordering. It will also
    be .equals() to the original.

    .. versionadded:: 0.20.0

    Returns
    -------
    MultiIndex

    Examples
    --------
    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

    >>> i[2:]
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               labels=[[1, 1], [0, 1]])

    The 0 from the first level is not represented
    and can be removed

    >>> i[2:].remove_unused_levels()
    MultiIndex(levels=[[1], ['a', 'b']],
               labels=[[0, 0], [0, 1]])
    """
    import pandas.core.algorithms as algos

    new_levels = []
    new_labels = []
    changed = False
    for lev, lab in zip(self.levels, self.labels):

        # Since few levels are typically unused, bincount() is more
        # efficient than unique() - however it only accepts positive values
        # (and drops order):
        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
        has_na = int(len(uniques) and (uniques[0] == -1))

        if len(uniques) != len(lev) + has_na:
            # We have unused levels
            changed = True

            # Recalculate uniques, now preserving order.
            # Can easily be cythonized by exploiting the already existing
            # "uniques" and stop parsing "lab" when all items are found:
            uniques = algos.unique(lab)
            if has_na:
                na_idx = np.where(uniques == -1)[0]
                # Just ensure that -1 is in first position:
                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]

            # labels get mapped from uniques to 0:len(uniques)
            # -1 (if present) is mapped to last position
            label_mapping = np.zeros(len(lev) + has_na)
            # ... and reassigned value -1:
            label_mapping[uniques] = np.arange(len(uniques)) - has_na

            lab = label_mapping[lab]

            # new levels are simple
            lev = lev.take(uniques[has_na:])

        new_levels.append(lev)
        new_labels.append(lab)

    result = self._shallow_copy()

    if changed:
        result._reset_identity()
        result._set_levels(new_levels, validate=False)
        result._set_labels(new_labels, validate=False)

    return result
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=3,
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = cast("Series", concat([id_data] * K, ignore_index=True))
        else:
            id_data = np.tile(id_data._values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        # error: Incompatible types in assignment (expression has type "ndarray",
        # target has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(i)
        ).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result
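# Brief usage example of the public API implemented above (pd.melt /
# DataFrame.melt); nothing here is specific to this internal version.
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "a": [3, 4], "b": [5, 6]})
long_df = df.melt(id_vars="id", value_vars=["a", "b"],
                  var_name="variable", value_name="value")
print(long_df)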
def test_ints(self):
    arr = np.random.randint(0, 100, size=50)
    result = algos.unique(arr)
    tm.assertIsInstance(result, np.ndarray)
def unique(self):
    uniques = list(algos.unique(self.sp_values))
    fill_loc = self._first_fill_value_loc()
    if fill_loc >= 0:
        uniques.insert(fill_loc, self.fill_value)
    return type(self)._from_sequence(uniques, dtype=self.dtype)
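# Hedged illustration of the fill-value handling above (assumption: a pandas
# version exposing pd.arrays.SparseArray): unique() runs over the stored
# sparse values, then the fill value is inserted at its first occurrence so
# the original order of appearance is preserved.
import pandas as pd

arr = pd.arrays.SparseArray([1, 0, 0, 2], fill_value=0)
print(arr.unique())  # values [1, 0, 2] -- fill value 0 kept in original order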
def test_objects(self):
    arr = np.random.randint(0, 100, size=50).astype('O')
    result = algos.unique(arr)
    tm.assert_isinstance(result, np.ndarray)
def test_uint64_overflow(self):
    s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
    exp = np.array([1, 2, 2**63], dtype=np.uint64)
    tm.assert_numpy_array_equal(algos.unique(s), exp)
def unique(self: _T) -> _T:
    new_data = unique(self._ndarray)
    return self._from_backing_data(new_data)