def test_store_xlsx_read_many_d(self) -> None:
    '''With trim_nadir enabled, trailing all-None rows and columns are dropped on read.'''
    records = (
            (2, 2, 'a', False, None),
            (30, 73, 'd', True, None),
            (None, None, None, None, None),
            (None, None, None, None, None),
            )
    columns = IndexHierarchy.from_labels(
            (('a', 1), ('a', 2), ('b', 1), ('b', 2), (None, None)))
    f1 = Frame.from_records(records, columns=columns)

    with temp_file('.xlsx') as fp:
        f1.to_xlsx(fp, label='f1', include_index=False, include_columns=True)

        store = StoreXLSX(fp)
        config = StoreConfig(
                index_depth=0,
                columns_depth=2,
                trim_nadir=True,
                )
        f2 = next(store.read_many(('f1',), config=config))
        # the two all-None rows and the (None, None) column are trimmed
        self.assertEqual(f2.shape, (2, 4))
        self.assertEqual(f2.to_pairs(),
                ((('a', 1), ((0, 2), (1, 30))),
                 (('a', 2), ((0, 2), (1, 73))),
                 (('b', 1), ((0, 'a'), (1, 'd'))),
                 (('b', 2), ((0, False), (1, True)))))
def from_pandas(cls,
        value: 'pandas.Index',
        ) -> 'IndexBase':
    '''
    Given a Pandas index, return the appropriate IndexBase derived class.
    '''
    # pandas is imported locally so it remains an optional dependency
    import pandas
    if not isinstance(value, pandas.Index):
        raise ErrorInitIndex(f'from_pandas must be called with a Pandas Index object, not: {type(value)}')

    # local imports avoid a circular dependency at module load time
    from static_frame import Index
    from static_frame import IndexGO
    from static_frame import IndexHierarchy
    from static_frame import IndexHierarchyGO
    from static_frame import IndexNanosecond
    from static_frame import IndexNanosecondGO
    from static_frame.core.index_datetime import IndexDatetime

    if isinstance(value, pandas.MultiIndex):
        # iterating over a hierarchical index will iterate over labels
        name: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)
        # if not assigned Pandas returns None for all components, which will raise issue if trying to unset this index.
        if all(n is None for n in name): #type: ignore
            name = None
        depth = value.nlevels
        # cls.STATIC selects between the immutable and grow-only variants
        if not cls.STATIC:
            return IndexHierarchyGO.from_labels(value, name=name, depth_reference=depth)
        return IndexHierarchy.from_labels(value, name=name, depth_reference=depth)
    elif isinstance(value, pandas.DatetimeIndex):
        # if IndexDatetime, use cls, else use IndexNanosecond
        if issubclass(cls, IndexDatetime):
            return cls(value, name=value.name)
        else:
            # pandas DatetimeIndex values are ns-resolution; map to IndexNanosecond
            if not cls.STATIC:
                return IndexNanosecondGO(value, name=value.name)
            return IndexNanosecond(value, name=value.name)
    # fall-through: a flat, non-datetime Pandas Index
    if not cls.STATIC:
        return IndexGO(value, name=value.name)
    return Index(value, name=value.name)
def test_store_get_field_names_and_dtypes_d(self) -> None:
    '''Hierarchical columns render as bracketed field names; conflicting include_* flags raise.'''
    from static_frame.core.index_hierarchy import IndexHierarchy

    columns = IndexHierarchy.from_labels(
            ((1, 'a'), (1, 'b'), (2, 'c')),
            name=('foo', 'bar'))
    f1 = Frame.from_records((('a', True, None),),
            index=(('a',)),
            columns=columns)

    field_names, dtypes = Store.get_field_names_and_dtypes(frame=f1,
            include_index=True,
            include_index_name=False,
            include_columns=True,
            include_columns_name=True,
            )
    self.assertEqual(field_names,
            [('foo', 'bar'), "[1 'a']", "[1 'b']", "[2 'c']"])
    self.assertTrue(len(field_names) == len(dtypes))

    # force_brackets renders the columns name tuple in bracketed form as well
    field_names, dtypes = Store.get_field_names_and_dtypes(frame=f1,
            include_index=True,
            include_index_name=False,
            include_columns=True,
            include_columns_name=True,
            force_brackets=True,
            )
    self.assertEqual(field_names,
            ["['foo' 'bar']", "[1 'a']", "[1 'b']", "[2 'c']"])

    # including the index but asking for neither index name nor columns name conflicts
    with self.assertRaises(StoreParameterConflict):
        field_names, dtypes = Store.get_field_names_and_dtypes(frame=f1,
                include_index=True,
                include_index_name=False,
                include_columns=True,
                include_columns_name=False,
                )
    # columns name cannot be included when the index is excluded
    with self.assertRaises(StoreParameterConflict):
        field_names, dtypes = Store.get_field_names_and_dtypes(frame=f1,
                include_index=False,
                include_index_name=False,
                include_columns=True,
                include_columns_name=True,
                )
def test_store_xlsx_read_c(self) -> None:
    '''Round-trip a hierarchical index written without columns through XLSX.'''
    index = IndexHierarchy.from_product(('left', 'right'), ('up', 'down'))
    columns = IndexHierarchy.from_labels(((100, -5, 20),))
    f1 = Frame([1, 2, 3, 4], index=index, columns=columns)

    with temp_file('.xlsx') as fp:
        store = StoreXLSX(fp)
        store.write(((None, f1),), include_index=True, include_columns=False)

        f2 = store.read(index_depth=f1.index.depth, columns_depth=0)
        self.assertTrue((f1.values == f2.values).all())
        # columns were not written, so they come back as a positional range
        self.assertEqual(f2.to_pairs(0),
                ((0, ((('left', 'up'), 1),
                      (('left', 'down'), 2),
                      (('right', 'up'), 3),
                      (('right', 'down'), 4))),)
                )
def test_bus_extract_loc_a(self) -> None:
    '''A Bus cannot be built from a Series indexed by an IndexHierarchy.'''
    f1 = Frame.from_dict(dict(a=(1, 2), b=(3, 4)),
            index=('x', 'y'),
            name='foo')
    f2 = Frame.from_dict(dict(a=(1, 2, 3), b=(4, 5, 6)),
            index=('x', 'y', 'z'),
            name='bar')
    f3 = Frame.from_dict(dict(d=(10, 20), b=(50, 60)),
            index=('p', 'q'),
            name='f3')

    ih = IndexHierarchy.from_labels((('a', 1), ('b', 2), ('b', 1)))
    s1 = Series((f1, f2, f3), index=ih, dtype=object)

    # IndexHierarchy is not supported: its labels are tuples, not strings
    with self.assertRaises(ErrorInitBus):
        _ = Bus(s1)
def test_store_xlsx_read_b(self) -> None:
    '''Round-trip hierarchical columns written without an index through XLSX.'''
    index = IndexHierarchy.from_product(('left', 'right'), ('up', 'down'))
    columns = IndexHierarchy.from_labels(((100, -5, 20),))
    f1 = Frame.from_elements([1, 2, 3, 4], index=index, columns=columns)

    config_map = StoreConfigMap.from_config(
            StoreConfig(include_index=False, include_columns=True))

    with temp_file('.xlsx') as fp:
        store = StoreXLSX(fp)
        store.write(((None, f1),), config=config_map)

        config = StoreConfig(index_depth=0, columns_depth=f1.columns.depth)
        f2 = store.read(None, config=config)
        self.assertTrue((f1.values == f2.values).all())
        # index was not written, so rows come back positionally
        self.assertEqual(f2.to_pairs(0),
                (((100, -5, 20), ((0, 1), (1, 2), (2, 3), (3, 4))),))
def from_concat(cls,
        containers: tp.Iterable['Series'],
        *,
        name: tp.Hashable = None):
    '''
    Concatenate multiple Series into a new Series, assuming the combination of all Indices result in a unique Index.
    '''
    # single pass over containers so that one-shot iterables are supported
    value_arrays = []
    label_arrays = []
    for container in containers:
        value_arrays.append(container.values)
        label_arrays.append(container.index.values)

    # concat_resolved returns immutable arrays
    values = concat_resolved(value_arrays)
    index = concat_resolved(label_arrays)

    # a 2D label array means the concatenated indices are hierarchical
    if index.ndim == 2:
        index = IndexHierarchy.from_labels(index)
    return cls(values, index=index, name=name)
def read(self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
        container_type: tp.Type[Frame] = Frame,
        ) -> Frame:
    '''
    Read one sheet of this XLSX store into a container of ``container_type``.

    Args:
        label: Name of sheet to read from XLSX.
        container_type: Type of container to be returned, either Frame or a Frame subclass
    '''
    if config is None:
        config = StoreConfig() # get default
    index_depth = config.index_depth
    index_name_depth_level = config.index_name_depth_level
    columns_depth = config.columns_depth
    columns_name_depth_level = config.columns_name_depth_level
    trim_nadir = config.trim_nadir
    skip_header = config.skip_header
    skip_footer = config.skip_footer

    wb = self._load_workbook(self._fp)

    if label is None:
        # default to the first sheet when no label is given
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions
        ws.calculate_dimension()

    max_column = ws.max_column
    max_row = ws.max_row

    # adjust for downward shift for skipping header, then reduce for footer; at this value and beyond we stop
    last_row_count = max_row - skip_header - skip_footer

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []

    data = [] # pre-size with None?
    apex_rows = []

    if trim_nadir:
        # mask marks None cells over the full sheet extent; used below to trim
        mask = np.full((last_row_count, max_column), False)

    # start=-skip_header makes skipped header rows come out with negative counts
    for row_count, row in enumerate(ws.iter_rows(max_row=max_row), start=-skip_header):
        if row_count < 0:
            continue # due to skip header; preserves comparison to columns_depth
        if row_count >= last_row_count:
            break

        if trim_nadir:
            # cell-by-cell pass so the None mask can be populated
            row_data: tp.Sequence[tp.Any] = []
            for col_count, c in enumerate(row):
                if store_filter is None:
                    value = c.value
                else:
                    value = store_filter.to_type_filter_element(c.value)
                if value is None: # NOTE: only checking None, not np.nan
                    mask[row_count, col_count] = True
                row_data.append(value) # type: ignore
            if not row_data:
                # a zero-length row is treated as entirely empty
                mask[row_count] = True
        else:
            if store_filter is None:
                row_data = tuple(c.value for c in row)
            else:
                # only need to filter string values, but probably too expensive to pre-check
                row_data = tuple(store_filter.to_type_filter_element(c.value) for c in row)

        if row_count <= columns_depth - 1:
            # rows within the columns header region; the apex is the index-over-columns corner
            apex_rows.append(row_data[:index_depth])
            if columns_depth == 1:
                columns_values.extend(row_data[index_depth:])
            elif columns_depth > 1:
                columns_values.append(row_data[index_depth:])
            continue

        # body rows: split off index labels per index_depth
        if index_depth == 0:
            data.append(row_data)
        elif index_depth == 1:
            index_values.append(row_data[0])
            data.append(row_data[1:])
        else:
            index_values.append(row_data[:index_depth])
            data.append(row_data[index_depth:])

    wb.close()

    #-----------------------------------------------------------------------
    # Trim all-empty trailing rows created from style formatting GH#146. As the wb is opened in read-only mode, reverse iterating on the wb is not an option, nor is direct row access by integer
    if trim_nadir:
        # NOTE: `mask` is all data, while `data` is post index/columns extraction; this means that if a non-None label is found, the row/column will not be trimmed.
        row_mask = mask.all(axis=1)
        # offset by columns_depth to translate sheet coordinates to data coordinates
        row_trim_start = array1d_to_last_contiguous_to_edge(row_mask) - columns_depth
        if row_trim_start < len(row_mask) - columns_depth:
            data = data[:row_trim_start]
            if index_depth > 0: # this handles depth 1 and greater
                index_values = index_values[:row_trim_start]

        col_mask = mask.all(axis=0)
        col_trim_start = array1d_to_last_contiguous_to_edge(col_mask) - index_depth
        if col_trim_start < len(col_mask) - index_depth:
            # lazy generators: consumed by from_records / from_labels below
            data = (r[:col_trim_start] for r in data) #type: ignore
            if columns_depth == 1:
                columns_values = columns_values[:col_trim_start]
            if columns_depth > 1:
                columns_values = (r[:col_trim_start] for r in columns_values) #type: ignore

    #-----------------------------------------------------------------------
    # continue with Index and Frame creation
    index_name = None if columns_depth == 0 else apex_to_name(
            rows=apex_rows,
            depth_level=index_name_depth_level,
            axis=0,
            axis_depth=index_depth)

    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values, name=index_name)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(
                index_values,
                continuation_token=None,
                name=index_name,
                )
        own_index = True

    columns_name = None if index_depth == 0 else apex_to_name(
            rows=apex_rows,
            depth_level=columns_name_depth_level,
            axis=1,
            axis_depth=columns_depth)

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = container_type._COLUMNS_CONSTRUCTOR(columns_values, name=columns_name)
        own_columns = True
    elif columns_depth > 1:
        # columns were collected row-wise; zip rotates to label-wise orientation
        columns = container_type._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels(
                zip(*columns_values),
                continuation_token=None,
                name=columns_name,
                )
        own_columns = True

    return container_type.from_records(data, #type: ignore
            index=index,
            columns=columns,
            dtypes=config.dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name,
            consolidate_blocks=config.consolidate_blocks)
def read(self,
        label: tp.Optional[str] = None,
        *,
        index_depth: int = 1,
        columns_depth: int = 1,
        dtypes: DtypesSpecifier = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
        ) -> Frame:
    '''
    Args:
        {dtypes}
    '''
    wb = self._load_workbook(self._fp)

    if label is None:
        # default to the first sheet when no label is given
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions; not sure what conditions are best to show this
        ws.calculate_dimension()

    # NOTE(review): max_column / max_row are captured but not referenced below in this implementation
    max_column = ws.max_column
    max_row = ws.max_row

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []

    data = []

    for row_count, row in enumerate(ws.iter_rows()): # cannot use values_only on 2.5.4
        if store_filter is None:
            row = tuple(c.value for c in row)
        else:
            # only need to filter string values, but probably too expensive to pre-check
            row = tuple(store_filter.to_type_filter_element(c.value) for c in row)
        if row_count <= columns_depth - 1:
            # rows within the columns header region
            if columns_depth == 1:
                columns_values.extend(row[index_depth:])
            elif columns_depth > 1:
                # NOTE: this orientation will need to be rotated
                columns_values.append(row[index_depth:])
            continue
        # body rows: split off index labels per index_depth
        if index_depth == 0:
            data.append(row)
        elif index_depth == 1:
            index_values.append(row[0])
            data.append(row[1:])
        else:
            index_values.append(row[:index_depth])
            data.append(row[index_depth:])

    wb.close()

    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(index_values, continuation_token=None)
        own_index = True

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = Index(columns_values)
        own_columns = True
    elif columns_depth > 1:
        # zip rotates row-wise collected labels into label-wise orientation
        columns = IndexHierarchy.from_labels(zip(*columns_values), continuation_token=None)
        own_columns = True

    return tp.cast(Frame, Frame.from_records(data,
            index=index,
            columns=columns,
            dtypes=dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name))
def read(self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
        container_type: tp.Type[Frame] = Frame,
        ) -> Frame:
    '''
    Read one sheet of this XLSX store into a container of ``container_type``.

    Args:
        label: Name of sheet to read from XLSX.
        container_type: Type of container to be returned, either Frame or a Frame subclass
    '''
    if config is None:
        config = StoreConfig() # get default
    index_depth = config.index_depth
    columns_depth = config.columns_depth

    wb = self._load_workbook(self._fp)

    if label is None:
        # default to the first sheet when no label is given
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions; not sure what conditions are best to show this
        ws.calculate_dimension()

    max_column = ws.max_column
    max_row = ws.max_row

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []

    data = [] # pre-size with None?

    for row_count, row in enumerate(ws.iter_rows(max_row=max_row)):
        if store_filter is None:
            row = tuple(c.value for c in row)
        else:
            # only need to filter string values, but probably too expensive to pre-check
            row = tuple(store_filter.to_type_filter_element(c.value) for c in row)
        if row_count <= columns_depth - 1:
            # rows within the columns header region
            if columns_depth == 1:
                columns_values.extend(row[index_depth:])
            elif columns_depth > 1:
                # NOTE: this orientation will need to be rotated
                columns_values.append(row[index_depth:])
            continue
        # body rows: split off index labels per index_depth
        if index_depth == 0:
            data.append(row)
        elif index_depth == 1:
            index_values.append(row[0])
            data.append(row[1:])
        else:
            index_values.append(row[:index_depth])
            data.append(row[index_depth:])

    wb.close()

    # Trim all-empty trailing rows created from style formatting GH#146. As the wb is opened in read-only mode, reverse iterating on the wb is not an option, nor is direct row access by integer; also, evaluating all rows on forward iteration is expensive. Instead, after collecting all the data in a list and closing the wb, reverse iterate and find rows that are all empty.
    # NOTE: need to handle case where there are valid index values
    empty_token = (None if store_filter is None
            else store_filter.to_type_filter_element(None))

    # walk backwards; the range floor of -2 lets row_count reach -1 when every row is empty
    for row_count in range(len(data) - 1, -2, -1):
        if row_count < 0:
            break
        if any(c != empty_token for c in data[row_count]): # try to break early with any
            break
        if index_depth == 1 and index_values[row_count] != empty_token:
            break
        if index_depth > 1 and any(c != empty_token for c in index_values[row_count]):
            break

    # row_count is set to the first row that has data or index; can be -1
    empty_row_idx = row_count + 1 # index of all-empty row
    if empty_row_idx != len(data):
        # trim data and index_values, if index_depth > 0
        data = data[:empty_row_idx]
        if index_depth > 0:
            index_values = index_values[:empty_row_idx]

    # continue with Index and Frame creation
    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(index_values, continuation_token=None)
        own_index = True

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = container_type._COLUMNS_CONSTRUCTOR(columns_values)
        own_columns = True
    elif columns_depth > 1:
        # zip rotates row-wise collected labels into label-wise orientation
        columns = container_type._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels(
                zip(*columns_values), continuation_token=None)
        own_columns = True

    # NOTE: this might be a Frame or a FrameGO
    return tp.cast(Frame, container_type.from_records(data,
            index=index,
            columns=columns,
            dtypes=config.dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name,
            consolidate_blocks=config.consolidate_blocks))