def test_ufunc_unique_b(self):
    '''Check ufunc_unique on a 2D integer array with no axis, axis 0, and axis 1.
    '''
    source = np.array([[1, 1], [1, 2], [1, 2]])

    # (keyword arguments, expected result of ``.tolist()``)
    cases = (
            ({}, [1, 2]),
            ({'axis': 0}, [[1, 1], [1, 2]]),
            ({'axis': 1}, [[1, 1], [1, 2], [1, 2]]),
            )
    for kwargs, expected in cases:
        self.assertEqual(ufunc_unique(source, **kwargs).tolist(), expected)
def test_ufunc_unique_c(self):
    '''Check ufunc_unique on a 2D object array mixing int, str, and None.

    For this input the results are compared against sets rather than arrays.
    '''
    rows = [[1, 'x', 1], [1, None, 1], [1, 'x', 1]]
    mixed = np.array(rows, dtype=object)

    # no axis: every element is pooled together
    self.assertEqual(ufunc_unique(mixed), {'x', 1, None})

    # axis 0: distinct rows, expressed as tuples
    self.assertEqual(
            ufunc_unique(mixed, axis=0),
            {(1, 'x', 1), (1, None, 1)},
            )

    # axis 1: distinct columns, expressed as tuples
    self.assertEqual(
            ufunc_unique(mixed, axis=1),
            {(1, 1, 1), ('x', None, 'x')},
            )
def test_ufunc_unique_a(self):
    '''Check ufunc_unique on 1D arrays: integer dtype, object dtype holding only ints, and object dtype holding mixed types.
    '''
    repeated = [1, 1, 1, 2, 2]

    # the same integer values, first with the inferred dtype (dtype=None is np.array's default), then as object
    for dtype in (None, object):
        unique = ufunc_unique(np.array(repeated, dtype=dtype))
        self.assertEqual(unique.tolist(), [1, 2])

    # mixed int / str / None values are compared against a set
    mixed = np.array([1, 'x', 1, None, 2], dtype=object)
    self.assertEqual(ufunc_unique(mixed), {None, 1, 2, 'x'})
def test_ufunc_unique_b(self) -> None:
    '''Check that ufunc_unique on a 2D integer array returns an ndarray with the expected values for no axis, axis 0, and axis 1.
    '''
    source = np.array([[1, 1], [1, 2], [1, 2]])

    # (keyword arguments, expected result of ``.tolist()``)
    cases: tp.Tuple[tp.Tuple[tp.Dict[str, int], tp.List[tp.Any]], ...] = (
            ({}, [1, 2]),
            ({'axis': 0}, [[1, 1], [1, 2]]),
            ({'axis': 1}, [[1, 1], [1, 2], [1, 2]]),
            )
    for kwargs, expected in cases:
        result = ufunc_unique(source, **kwargs)
        # narrow the type before calling the ndarray-only ``tolist``
        assert isinstance(result, np.ndarray)
        self.assertEqual(result.tolist(), expected)
def pivot_outer_index(
        frame: 'Frame',
        index_fields: tp.Sequence[tp.Hashable],
        index_depth: int,
        index_constructor: IndexConstructor = None,
        ) -> IndexBase:
    '''Build an index from the unique (axis 0) values found in the ``index_fields`` columns of ``frame``.

    Args:
        frame: the Frame from which index labels are drawn.
        index_fields: column labels whose values supply the index labels.
        index_depth: when 1, only the first of ``index_fields`` is used; otherwise all ``index_fields`` are used together.
        index_constructor: optional constructor; when given it is used, with ``name`` bound, in place of the default constructor.

    Returns:
        The index produced by ``index_from_optional_constructor``; when ``index_depth`` is not 1, the result of calling ``flat()`` on that index.
    '''
    single = index_depth == 1
    selection = index_fields if index_depth > 1 else index_fields[0]
    iloc = frame._columns._loc_to_iloc(selection)

    if single:
        labels = ufunc_unique(
                frame._blocks._extract_array_column(iloc),
                axis=0,
                )
        name = index_fields[0]
        default = partial(Index, name=name)
    else:
        # NOTE: extracting several columns as one array might force an undesirable type consolidation
        labels = ufunc_unique(
                frame._blocks._extract_array(column_key=iloc),
                axis=0,
                )
        # NOTE: if index_types need to be provided to an IH here, they must be partialed in the single-argument index_constructor
        name = tuple(index_fields)
        default = partial(IndexHierarchy.from_labels, name=name)

    if index_constructor is None:
        explicit = None
    else:
        explicit = partial(index_constructor, name=name)

    index = index_from_optional_constructor(
            labels,
            default_constructor=default,
            explicit_constructor=explicit,
            )
    if single:
        return index
    return index.flat() # type: ignore
def test_ufunc_unique(self, array: np.ndarray) -> None:
    '''Property check: the number of unique items found along axis 0 never exceeds the length of the array's first dimension.
    '''
    unique_count = len(util.ufunc_unique(array, axis=0))
    row_count = array.shape[0]
    self.assertTrue(unique_count <= row_count)
def unique(self) -> np.ndarray:
    '''
    Return the unique values found in ``self.values``, as computed by ``ufunc_unique``.
    '''
    values = self.values
    return ufunc_unique(values)
def pivot_core(
        *,
        frame: 'Frame',
        index_fields: tp.List[tp.Hashable],
        columns_fields: tp.List[tp.Hashable],
        data_fields: tp.List[tp.Hashable],
        func_fields: tp.Tuple[tp.Hashable, ...],
        func_single: tp.Optional[UFunc],
        func_map: tp.Sequence[tp.Tuple[tp.Hashable, UFunc]],
        fill_value: object = np.nan,
        index_constructor: IndexConstructor = None,
        ) -> 'Frame':
    '''Core implementation of Frame.pivot(). The Frame has already been reduced to just relevant columns, and all fields groups are normalized as lists of hashables.

    Args:
        frame: the source Frame.
        index_fields: column labels whose values become the labels of the resulting index.
        columns_fields: column labels whose values are grouped to form the resulting columns; may be empty, in which case only ``index_fields`` are grouped.
        data_fields: column labels of the values to be placed (and, where needed, aggregated) in the result.
        func_fields: labels combined with ``data_fields`` to form column labels when ``func_single`` is not set.
        func_single: a single aggregation function, or None when ``func_map`` is to be used.
        func_map: pairs of label and aggregation function.
        fill_value: value used when reindexing each per-group result against the full outer index.
        index_constructor: optional constructor for the resulting index.

    Returns:
        A new Frame; built with ``frame.from_series`` / ``frame.from_records_items`` when ``columns_fields`` is empty, otherwise with ``frame.__class__``.
    '''
    from static_frame.core.series import Series
    from static_frame.core.frame import Frame

    data_fields_len = len(data_fields)
    index_depth = len(index_fields)

    # all are lists of hashables; get converted to lists of integers
    columns_loc_to_iloc = frame.columns._loc_to_iloc
    index_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(index_fields) #type: ignore
    data_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(data_fields) #type: ignore
    columns_fields_iloc: tp.Sequence[int] = columns_loc_to_iloc(columns_fields) #type: ignore

    # For data fields, we add the field name, not the field values, to the columns.
    columns_name = tuple(columns_fields)
    if data_fields_len > 1 or not columns_fields:
        # if no columns_fields, have to add values label
        # NOTE: chain over columns_fields itself, not its unpacked items: each field must contribute exactly one name component. Unpacking would iterate into every label, splitting multi-character strings and failing on non-iterable labels.
        columns_name = tuple(chain(columns_fields, ('values',)))
    if len(func_map) > 1:
        columns_name = columns_name + ('func',)

    columns_depth = len(columns_name)

    if columns_depth == 1:
        columns_name = columns_name[0] # type: ignore
        columns_constructor = partial(frame._COLUMNS_CONSTRUCTOR, name=columns_name)
    else:
        columns_constructor = partial(
                frame._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
                depth_reference=columns_depth,
                name=columns_name)

    dtype_map = frame.dtypes
    dtypes_per_data_fields = tuple(pivot_records_dtypes(
            dtype_map=dtype_map,
            data_fields=data_fields,
            func_single=func_single,
            func_map=func_map,
            ))
    # NOTE(review): dtype_single is only bound under this condition, yet the single-column paths below read it; presumably those paths are only reached with func_single set and exactly one data field -- confirm against callers.
    if func_single and data_fields_len == 1:
        dtype_single = ufunc_dtype_to_dtype(func_single, dtype_map[data_fields[0]])

    #---------------------------------------------------------------------------
    # first major branch: if we are only grouping by index fields

    if not columns_fields: # group by is only index_fields
        columns = data_fields if func_single else tuple(product(data_fields, func_fields))

        # NOTE: at this time we do not automatically give back an IndexHierarchy when index_depth is == 1, as the order of the resultant values may not be hierarchable.
        name_index = index_fields[0] if index_depth == 1 else tuple(index_fields)
        if index_constructor:
            index_constructor = partial(index_constructor, name=name_index)
        else:
            index_constructor = partial(Index, name=name_index)

        if len(columns) == 1:
            # assert len(data_fields) == 1
            f = frame.from_series(
                    Series.from_items(
                            pivot_items(
                                    blocks=frame._blocks,
                                    group_fields_iloc=index_fields_iloc,
                                    group_depth=index_depth,
                                    data_field_iloc=data_fields_iloc[0],
                                    func_single=func_single,
                                    ),
                            name=columns[0],
                            index_constructor=index_constructor,
                            dtype=dtype_single,
                            ),
                    columns_constructor=columns_constructor)
        else:
            f = frame.from_records_items(
                    pivot_records_items(
                            blocks=frame._blocks,
                            group_fields_iloc=index_fields_iloc,
                            group_depth=index_depth,
                            data_fields_iloc=data_fields_iloc,
                            func_single=func_single,
                            func_map=func_map,
                            ),
                    columns_constructor=columns_constructor,
                    columns=columns,
                    index_constructor=index_constructor,
                    dtypes=dtypes_per_data_fields,
                    )

        # have to rename columns if derived in from_concat
        columns_final = (f.columns.rename(columns_name) if columns_depth == 1
                else columns_constructor(f.columns))
        return f.relabel(columns=columns_final) #type: ignore

    #---------------------------------------------------------------------------
    # second major branch: we are grouping by index and columns fields

    # avoid doing a multi-column-style selection if not needed: with a single columns field, each group label is wrapped in a one-element tuple before deriving column labels
    retuple_group_label = len(columns_fields) == 1

    # group by on 1 or more columns fields
    # NOTE: explored doing one group on index and columns that insert into pre-allocated arrays, but that proved slower than this approach
    group_key = columns_fields_iloc if len(columns_fields_iloc) > 1 else columns_fields_iloc[0]

    index_outer = pivot_outer_index(
            frame=frame,
            index_fields=index_fields,
            index_depth=index_depth,
            index_constructor=index_constructor,
            )

    # collect subframes based on an index of tuples and columns of tuples (if depth > 1)
    sub_blocks = []
    sub_columns_collected: tp.List[tp.Hashable] = []

    for group, _, sub in frame._blocks.group(axis=0, key=group_key):
        # derive the column fields represented by this group
        sub_columns = extrapolate_column_fields(
                columns_fields,
                group if not retuple_group_label else (group,),
                data_fields,
                func_fields)
        sub_columns_collected.extend(sub_columns)

        # sub is TypeBlocks unique value in columns_group; this may or may not have unique index fields; if not, it needs to be aggregated
        if index_depth == 1:
            sub_index_labels = sub._extract_array_column(index_fields_iloc[0])
            sub_index_labels_unique = ufunc_unique(sub_index_labels)
        else:
            # match to an index of tuples; the order might not be the same as IH
            # NOTE: might be able to keep arrays and concat below
            sub_index_labels = tuple(zip(*(
                    sub._extract_array_column(columns_loc_to_iloc(f))
                    for f in index_fields)))
            sub_index_labels_unique = set(sub_index_labels)

        sub_frame: tp.Union[Frame, Series]

        # if sub_index_labels are not unique we need to aggregate
        if len(sub_index_labels_unique) != len(sub_index_labels):
            # if sub_columns length is 1, that means that we only need to extract one column out of the sub Frame
            if len(sub_columns) == 1:
                assert len(data_fields) == 1
                # NOTE: grouping on index_fields; can pre-process array_to_groups_and_locations
                sub_frame = Series.from_items(
                        pivot_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_field_iloc=data_fields_iloc[0],
                                func_single=func_single,
                                ),
                        dtype=dtype_single,
                        )
            else:
                sub_frame = Frame.from_records_items(
                        pivot_records_items(
                                blocks=sub,
                                group_fields_iloc=index_fields_iloc,
                                group_depth=index_depth,
                                data_fields_iloc=data_fields_iloc,
                                func_single=func_single,
                                func_map=func_map),
                        dtypes=dtypes_per_data_fields,
                        )
        else:
            # we have unique values per index item, but may not have a complete index
            if func_single:
                # NOTE: should apply function even with func_single
                if len(data_fields) == 1:
                    sub_frame = Frame(
                            sub._extract_array_column(data_fields_iloc[0]),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
                else:
                    sub_frame = Frame(
                            sub._extract(row_key=None, column_key=data_fields_iloc),
                            index=sub_index_labels,
                            index_constructor=index_constructor,
                            own_data=True)
            else:
                def blocks() -> tp.Iterator[np.ndarray]:
                    # each data column is yielded once per func_map entry; the functions themselves are not applied here
                    for field in data_fields_iloc:
                        for _, func in func_map:
                            yield sub._extract_array_column(field)
                sub_frame = Frame(
                        TypeBlocks.from_blocks(blocks()),
                        index=sub_index_labels,
                        own_data=True,
                        )

        # align this group's result to the full outer index, filling labels absent from the group
        sub_frame = sub_frame.reindex(
                index_outer,
                own_index=True,
                fill_value=fill_value,
                )
        if sub_frame.ndim == 1:
            sub_blocks.append(sub_frame.values)
        else:
            sub_blocks.extend(sub_frame._blocks._blocks) # type: ignore

    tb = TypeBlocks.from_blocks(sub_blocks)
    return frame.__class__(
            tb,
            index=index_outer,
            columns=columns_constructor(sub_columns_collected),
            own_data=True,
            own_index=True,
            own_columns=True,
            )