def bus_aggregate() -> None:
    import frame_fixtures as ff
    from itertools import zip_longest

    # create the "total index"
    index = sf.IndexDate.from_date_range('2019-01-12', '2019-12-31')

    # create dummy data indexed by the total index and with columns
    src = ff.parse(f's({len(index)},4)').relabel(index=index, columns=tuple('abcd'))

    chunk_size = 7
    window_size = 12

    # create a Bus by chunking src
    def items_frames() -> tp.Iterator[sf.Frame]:
        starts = range(0, len(index), chunk_size)
        ends = range(starts[1], len(index), chunk_size)
        for start, end in zip_longest(starts, ends, fillvalue=len(index)):
            f = src.iloc[start:end]
            yield f.rename(str(f.index.iloc[0]))

    bus = sf.Bus.from_frames(items_frames())

    # create a Series mapping total-index labels to Bus labels
    def items_map() -> tp.Iterator[tp.Tuple[np.datetime64, str]]:
        for f in bus.values:
            for dt in f.index:
                yield dt, f.name

    ref = sf.Series.from_items(items_map())

    # using the ref, we can select and concat from any start, end
    def get_slice(start: np.datetime64, end: np.datetime64) -> sf.Frame:
        bus_labels = ref[start:end].unique()
        return sf.Frame.from_concat(
                bus[bus_labels].values).loc[start:end] #type: ignore

    # selecting an arbitrary window
    fsub = get_slice(index.iloc[3], index.iloc[100])

    # creating a Batch for windowed processing
    def items_window() -> tp.Iterator[tp.Tuple[np.datetime64, sf.Frame]]:
        for window in index.to_series().iter_window(size=window_size):
            fsub = get_slice(window.values[0], window.values[-1]) #type: ignore
            yield fsub.index[-1], fsub

    post = sf.Batch(items_window()).mean().to_frame()

    # comparing src to batch-extracted windows
    post_alt = sf.Batch(
            src.iter_window_items(size=window_size)).mean().to_frame()

    assert post.equals(post_alt)
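# An alternative sketch (an assumption, not part of the routine above): a Quilt over the
# chunked Bus may provide the same cross-chunk slicing without building the ref Series by
# hand. The function name is hypothetical; Quilt is used elsewhere in this module, and the
# inner date labels are unique across chunks, so retain_labels can be False.
def bus_aggregate_quilt(bus: sf.Bus, start: np.datetime64, end: np.datetime64) -> sf.Frame:
    # present the chunked Bus as one virtual frame indexed by the total index
    quilt = sf.Quilt(bus, axis=0, retain_labels=False)
    return quilt.loc[start:end] #type: ignore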
def main() -> None:
    fp = '/home/ariza/Downloads/archive.zip'
    config = sf.StoreConfig(index_depth=0, label_decoder=int, label_encoder=str)
    bus = sf.Bus.from_zip_csv(fp, config=config)

    def normalize(f: sf.Frame) -> sf.Frame:
        fields = ['country', 'score', 'rank', 'gdp']

        def relabel(label: str) -> str:
            for field in fields:
                if field in label.lower():
                    return field
            return label

        return f.relabel(columns=relabel)[fields].set_index(
                fields[0], drop=True) #type: ignore

    bus = sf.Batch.from_zip_csv(fp, config=config).apply(normalize).to_bus()

    # selection
    batch = sf.Batch(bus.items())
    quilt = sf.Quilt(bus, axis=0, retain_labels=True)
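# A brief usage sketch (an assumption, not part of the function above) for the two
# containers created there: the Batch applies operations per table, while the Quilt
# presents all tables as one virtual frame. The function name is hypothetical; 'score'
# is one of the fields produced by normalize.
def main_selection_sketch(bus: sf.Bus) -> None:
    # per-table mean of the score column, one value per table label
    print(sf.Batch(bus.items())['score'].mean().to_frame())
    # the same tables viewed as a single frame with a hierarchical index
    quilt = sf.Quilt(bus, axis=0, retain_labels=True)
    print(quilt.shape)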
def stocks() -> None:
    t = Timer()
    bus = sf.Bus.from_zip_pickle('/tmp/stocks.zip')[:]
    print(t, 'done loading')

    t = Timer()
    post = sf.Batch(bus.items())[['Open', 'Close']].loc_max().to_frame()
    print(t, 'serial')
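# A parallel variant sketch (an assumption, not part of the routine above), reusing the
# max_workers / use_threads parameters that appear with Batch later in this module.
def stocks_parallel() -> None:
    t = Timer()
    bus = sf.Bus.from_zip_pickle('/tmp/stocks.zip')[:]
    post = sf.Batch(bus.items(), max_workers=6, use_threads=True)[['Open', 'Close']].loc_max().to_frame()
    print(t, 'parallel')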
def stocks_write() -> None:
    t = Timer()
    d = '/home/ariza/Downloads/archive/Stocks'
    fps = ((fn, os.path.join(d, fn)) for fn in os.listdir(d))
    items = ((fn.replace('.us.txt', ''), sf.Frame.from_csv(fp, index_depth=1))
            for fn, fp in fps if os.path.getsize(fp))
    sf.Batch(items).to_zip_pickle('/tmp/stocks.zip')
    print(t)
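# A minimal verification sketch (an assumption, not part of the routine above): lazily
# load the archive just written and report each Frame's shape. The function name and the
# max_persist value are illustrative; the constructor is the same one used in stocks().
def stocks_write_verify() -> None:
    bus = sf.Bus.from_zip_pickle('/tmp/stocks.zip', max_persist=1)
    for label in bus.index:
        print(label, bus[label].shape)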
def bus_batch_streaming() -> None:
    TypeIterFrameItems = tp.Iterator[tp.Tuple[tp.Hashable, sf.Frame]]

    # do a bunch of processing on large Frames while maintaining single-frame overhead
    def origin_data() -> TypeIterFrameItems:
        for label, i in ((chr(i), i) for i in range(65, 75)): # A, B, ...
            f = sf.Frame(np.arange(100000).reshape(1000, 100) * i, name=label)
            yield label, f

    # a StoreConfig is a bundle of read/write configuration in a single object
    config = sf.StoreConfig(include_index=True,
            include_columns=True,
            index_depth=1,
            columns_depth=1)

    # incrementally generate and write data one Frame at a time
    sf.Batch(origin_data()).to_zip_parquet('/tmp/pq-stg-01.zip', config=config)

    # we can read the same data store into a Bus
    # setting max_persist to 1 keeps only 1 Frame in memory
    src_bus = sf.Bus.from_zip_parquet('/tmp/pq-stg-01.zip', max_persist=1, config=config)

    def proc_data(bus: sf.Bus) -> TypeIterFrameItems:
        for label in bus.keys():
            f = bus[label]
            f_post = f * .00001
            yield label, f_post

    # incrementally read, process, and write data
    # note that we do not necessarily need to write this intermediate step; we might chain a number of processors together
    sf.Batch(proc_data(src_bus)).to_zip_parquet('/tmp/pq-stg-02.zip', config=config)

    # a function that reads through the derived data and produces a single in-memory result
    def derive_characteristic(bus: sf.Bus) -> sf.Series:
        def gen() -> tp.Iterator[tp.Tuple[tp.Hashable, float]]:
            for label in bus.keys():
                f = bus[label]
                yield label, f.mean().mean()
        return sf.Series.from_items(gen())

    post_bus = sf.Bus.from_zip_parquet('/tmp/pq-stg-02.zip', max_persist=1, config=config)
    print(derive_characteristic(post_bus))
    # <Series>
    # <Index>
    # A        32.499674999999996
    # B        32.99967
    # C        33.49966500000001
    # D        33.999660000000006
    # E        34.499655000000004
    # F        34.99965
    # G        35.49964500000001
    # H        35.99964000000001
    # I        36.499635000000005
    # J        36.999629999999996
    # <<U1>    <float64>

    # if we have a family of functions that process item pairs and do not need random access, we can chain them together without writing to an intermediary
    def proc_data_alt(items: TypeIterFrameItems) -> TypeIterFrameItems:
        for label, f in items:
            f_post = f * .00001
            yield label, f_post

    def derive_characteristic_alt(items: TypeIterFrameItems) -> sf.Series:
        def gen() -> tp.Iterator[tp.Tuple[tp.Hashable, float]]:
            for label, f in items:
                yield label, f.mean().mean()
        return sf.Series.from_items(gen())

    # now we can use a Batch to get sequential processing, and start with the origin data set
    batch = sf.Batch.from_zip_parquet('/tmp/pq-stg-01.zip', config=config)
    print(derive_characteristic_alt(proc_data_alt(batch.items())))
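# An illustrative sketch (an assumption, not part of the demo above) of chaining more than
# one item-pair processor before producing a single result; only the helper names here are
# hypothetical, and it presumes the parquet store written above already exists.
def chain_processors_sketch() -> None:
    TypeIterFrameItems = tp.Iterator[tp.Tuple[tp.Hashable, sf.Frame]]

    def scale(items: TypeIterFrameItems) -> TypeIterFrameItems:
        for label, f in items:
            yield label, f * .00001

    def head(items: TypeIterFrameItems) -> TypeIterFrameItems:
        for label, f in items:
            yield label, f.iloc[:100] # keep only the first 100 rows

    config = sf.StoreConfig(index_depth=1, columns_depth=1)
    batch = sf.Batch.from_zip_parquet('/tmp/pq-stg-01.zip', config=config)
    # generators compose lazily: only one Frame is realized at a time
    items = head(scale(batch.items()))
    print(sf.Series.from_items((label, f.mean().mean()) for label, f in items))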
def bus_batch_demo() -> None:
    # field definitions
    # https://www.ndbc.noaa.gov/measdes.shtml

    # full data set
    # https://www.ndbc.noaa.gov/view_text_file.php?filename=46222h2018.txt.gz&dir=data/historical/stdmet/

    # multi-table storage: XLSX, HDF5, SQLite, zipped bundles
    # how can we easily read, write, and process these?

    # Batch and Bus: two Series-like containers of Frames
    # Frames can have different shapes and types: not a Panel
    # Bus: random access to Frames, writing to multi-table formats, lazy loading
    # Batch: lazy, sequential processor of Frames, with support for reading/writing to multi-table formats

    sp2018 = sf.Frame.from_tsv('/tmp/san_pedro.txt')
    sp2018 = sf.Frame.from_tsv('/tmp/san_pedro.txt', dtypes={'datetime': 'datetime64[m]'})
    sp2018.shape # (16824, 5)

    sp2018 = sp2018.set_index('datetime', index_constructor=sf.IndexMinute, drop=True)
    # sp2018.loc['2018-01'].shape # (1419, 4)

    yms = sf.IndexYearMonth.from_year_month_range('2018-01', '2018-12')
    sp_per_month = [sp2018.loc[ym].rename(str(ym)) for ym in yms]
    [f.shape for f in sp_per_month]
    # [(1419, 4), (1295, 4), (1460, 4), (1411, 4), (1431, 4), (1418, 4), (1471, 4), (1160, 4), (1426, 4), (1471, 4), (1420, 4), (1442, 4)]

    sp_per_month[0].to_xlsx('/tmp/sp_2018_01.xlsx')

    # [TALK] introduce Bus: how to create a multi-sheet workbook
    # pandas: pd.ExcelWriter and pd.HDFStore
    sf.Bus.interface

    # from frames
    b1 = sf.Bus.from_frames(sp_per_month)
    b1.to_xlsx('/tmp/sp_2018.xlsx')

    # [TALK] to do this in a single line:
    # but note that this will fully create the Bus before writing
    sf.Bus.from_frames(sp2018.loc[ym].rename(str(ym)) for ym in yms).to_xlsx('/tmp/sp_2018.xlsx')

    # [TALK] what other formats can we export to?
    # show interface, constructors and exporters
    b1.interface

    b1.to_sqlite('/tmp/sp_2018.sqlite')

    # [TALK] value of parquet and pickle
    b1.to_zip_pickle('/tmp/sp_2018.zip')

    # [TALK] Bus is Series-like: can select using loc, iloc, etc.
    # limit in that the index has to be strings: need to use as labels in output formats
    b1['2018-08']
    b1['2018-08':] #type: ignore
    b1.iloc[[0, 5, 10]]
    b1[b1.index.isin(('2018-01', '2018-09'))]

    # [TALK] What about reading: we can lazily read from these output formats
    sf.Bus.interface

    b2 = sf.Bus.from_zip_pickle('/tmp/sp_2018.zip')
    # notice that we have FrameDeferred
    # when we access Frames, they simply get loaded as they are selected:
    b2['2018-08']
    b2 # show only one Frame loaded
    b2.iloc[[0, 5, 10]]
    b2 # show we have loaded additional Frames

    # what if you have 600 Frames and you do not want the stored frames to all load?
    b3 = sf.Bus.from_zip_pickle('/tmp/sp_2018.zip', max_persist=2)
    for label in b3.index:
        print(b3[label].shape)
    print(b3)

    #---------------------------------------------------------------------------
    # there are other situations where we need to deal with multiple Frames as a single unit
    import pandas as pd
    df = pd.read_csv('/tmp/san_pedro.txt', sep='\t')
    df['year_mo'] = df['datetime'].apply(lambda d: d[:7])

    # what happens when we do a group-by in Pandas
    df.groupby('year_mo')
    # <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb3263e7090>

    # can iterate
    [k for k, v in df.groupby('year_mo')]
    # ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06', '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

    # can take the max of each group in one line
    df.groupby('year_mo')['WVHT'].max()

    # how does this work: some operations are deferred, some result in containers
    df.groupby('year_mo')['WVHT']
    # <pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb32e1b7fd0>

    # but what if we want to do the same thing with Frames we created without a groupby, or by some other means?

    #---------------------------------------------------------------------------
    # the Batch
    batch1 = sf.Batch.from_frames(sp_per_month)

    # can call to_frame to realize all data with a hierarchical index:
    sf.Batch.from_frames(sp_per_month).to_frame()

    # can do the same thing as pandas, but have to explicitly call to_frame() to get a result
    batch1['WVHT'].max().to_frame()

    # Batches are like generators: one-time use, one-time iteration. If we look at batch1 again, we see it is empty
    batch1

    # the reason we have to explicitly call to_frame is that there is no limit on how many operations you can do on the Batch; all are deferred until final iteration or the to_frame() call
    # here, we convert meters to feet before getting the result
    (sf.Batch.from_frames(sp_per_month)['WVHT'].max() * 3.28084).to_frame()

    # what if we want to do this on a group-by?
    # SF's iter_group is simply an iterator of groups; we can feed it into a Batch
    # let's create a fresh FrameGO and add a year_mo column
    sp2018 = sf.FrameGO.from_tsv('/tmp/san_pedro.txt', dtypes={'datetime': 'datetime64[m]'})
    sp2018 = sp2018.set_index('datetime', index_constructor=sf.IndexMinute, drop=False)
    sp2018['year_mo'] = sp2018['datetime'].astype('datetime64[M]') #type: ignore
    sp2018.iter_group_items('year_mo')

    # we can feed the iterator of pairs of label, Frame to a Batch, then process
    sf.Batch(sp2018.iter_group_items('year_mo'))['WVHT'].max().to_frame()

    # any time we have iterators of pairs of label, Frame, we can use a Batch
    # for example, what if we want to iterate on windows?
    tuple(sp2018.iter_window_items(size=100))[0]

    # we can feed those same pairs into a Batch and get the rolling mean
    sf.Batch(sp2018.iter_window_items(size=100))['WVHT'].mean().to_frame()

    # again, we can convert meters to feet:
    (sf.Batch(sp2018.iter_window_items(size=100))['WVHT'].mean() * 3.28084).to_frame()

    # these operations are pretty fast, but we can potentially optimize them by using multi-threading or multi-processing
    (sf.Batch(sp2018.iter_window_items(size=100), max_workers=6, use_threads=True)['WVHT'].mean() * 3.28084).to_frame()

    # there is also a generic apply method to perform arbitrary functions
    sf.Batch(sp2018.iter_window_items(size=100, step=100)).apply(
            lambda f: f.loc[f['DPD'].loc_max(), ['WVHT', 'DPD']]).to_frame()

    # what if we want to read or write a multi-table format?
    # because the Batch is a lazy sequential processor, this is actually a pipeline that processes one table at a time
    # memory overhead is one table at a time
    sf.Batch.from_xlsx('/tmp/sp_2018.xlsx').apply(
            lambda f: f.assign['WVHT'](f['WVHT'] * 3.28084)).to_xlsx('/tmp/sp_2018_ft.xlsx')
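# A closing sketch (an assumption, not part of the demo above): the monthly frames can also
# be viewed through a Quilt, used elsewhere in this module, as one virtual frame without
# concatenating them. The function name is hypothetical.
def sp_quilt_sketch(sp_per_month: tp.List[sf.Frame]) -> None:
    # inner minute labels are unique across months, so the outer labels need not be retained
    quilt = sf.Quilt(sf.Bus.from_frames(sp_per_month), axis=0, retain_labels=False)
    print(quilt.shape)
    # column selection spanning all twelve tables
    print(quilt.loc[:, 'WVHT'].mean())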