class HDFStorePanel(BaseIO):

    goal_time = 0.2

    def setup(self):
        self.fname = '__test__.h5'
        with warnings.catch_warnings(record=True):
            self.p = Panel(np.random.randn(20, 1000, 25),
                           items=['Item%03d' % i for i in range(20)],
                           major_axis=date_range('1/1/2000', periods=1000),
                           minor_axis=['E%03d' % i for i in range(25)])
            self.store = HDFStore(self.fname)
            self.store.append('p1', self.p)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store_table_panel(self):
        with warnings.catch_warnings(record=True):
            self.store.select('p1')

    def time_write_store_table_panel(self):
        with warnings.catch_warnings(record=True):
            self.store.append('p2', self.p)
class HdfStore(DataStore):

    complevel = 9
    complib = "blosc:zstd"

    def __init__(self, path: str, table: str,
                 compute: Optional[Callable] = None) -> None:
        self.table = table
        if compute:
            self.store = PandasHDFStore(path, complevel=self.complevel,
                                        complib=self.complib)
            dataframe = compute()
            dataframe.sort_values(by="where", axis=0, inplace=True)
            self._mangle_where(dataframe)
            self.store.put(
                self.table,
                dataframe,
                append=False,
                format="table",
                expectedrows=len(dataframe),
                data_columns=[
                    "where_", "where_type", "who", "who_type", "when",
                    "when_type"
                ],
            )
        else:
            self.store = PandasHDFStore(path, complevel=self.complevel,
                                        complib=self.complib, mode="r")

    def query(self, query: str) -> DataFrame:
        query = self._mangle_where_in_query(query)
        df = self.store.select(self.table, where=query)
        self._unmangle_where(df)
        return df

    def _mangle_where(self, df: DataFrame) -> None:
        # See: https://github.com/PyTables/PyTables/issues/638
        df.rename(columns={"where": "where_"}, inplace=True)

    def _unmangle_where(self, df: DataFrame) -> None:
        # See: https://github.com/PyTables/PyTables/issues/638
        df.rename(columns={"where_": "where"}, inplace=True)

    def _mangle_where_in_query(
            self, query: Union[str, List[str]]) -> Union[str, List[str]]:
        # See: https://github.com/PyTables/PyTables/issues/638
        if isinstance(query, str):
            return re.sub("where([^_])", "where_\\1", query)
        else:
            return [
                self._mangle_where_in_query(subquery) for subquery in query
            ]
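For context, a minimal usage sketch of the class above. The build_dataframe helper, the events.h5 path, and the "facts" table name are hypothetical stand-ins, and PandasHDFStore is assumed to be pandas' HDFStore imported under that alias; the real schema and paths come from the caller.

import pandas as pd

def build_dataframe() -> pd.DataFrame:
    # Hypothetical compute() callable: returns the column set the class
    # declares as data_columns ("where" is mangled to "where_" on write).
    return pd.DataFrame({
        "where": ["kitchen", "garden"],
        "where_type": ["room", "outdoor"],
        "who": ["alice", "bob"],
        "who_type": ["person", "person"],
        "when": ["2020-01-01", "2020-01-02"],
        "when_type": ["date", "date"],
    })

# First construction writes the table; the second reopens it read-only.
writer = HdfStore("events.h5", "facts", compute=build_dataframe)
writer.store.close()  # the class exposes no close(); release the handle directly

reader = HdfStore("events.h5", "facts")
# "where" in the query string is rewritten to "where_" before hitting PyTables.
print(reader.query("where == 'kitchen'"))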
def predict_lm_per_store(data: pd.HDFStore, select_idx: pd.Index, features,
                         sales, save_fit=False):
    store_train = data.select('train', select_idx,
                              columns=list(features)).set_index(select_idx)
    assert store_train.shape == (len(select_idx), len(features))
    logger.debug('Store train shape {}'.format(store_train.shape))
    logger.debug('Sales shape {}'.format(sales.shape))
    lm = linear_model.LinearRegression()
    fit = lm.fit(store_train, sales)
    pred = fit.predict(store_train)
    store_train['PredictedSales'] = pred
    return store_train
def aggregate(hdf_store_loc, file_pattern, headerfile=None,
              remove_part_files=False):
    df = None
    store = HDFStore(hdf_store_loc)
    store_keys = [w.replace('/', '') for w in store.keys()]
    print(f'Aggregating part files in {hdf_store_loc} for {file_pattern} '
          f'into a single file')
    for key in store_keys:
        if re.match(file_pattern.replace('*', '.+'), key):
            print(f'Key {key} matches pattern {file_pattern.replace("*", ".+")}')
            # thisdf = pd.read_hdf(store_loc, key)
            thisdf = store.select(key)
            if df is None:
                df = thisdf
            else:
                # For gz files that don't have headers, assign headers.
                try:
                    df = df.append(thisdf, ignore_index=True, sort=True)
                except Exception as e:
                    print(f'Error while joining data: {e}')
            if remove_part_files:
                store.remove(key)
    try:
        # df.to_hdf(store_loc, key=file_pattern.replace('*', ''))
        store.put(key=file_pattern.replace('*', ''), value=df)
    except Exception as e:
        print(f'Exception while combining file for {file_pattern}: {e}')
    store.close()
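A hypothetical invocation, assuming an earlier step has already written part files into parts.h5 under keys such as sales_part1 and sales_part2 (names are illustrative):

# Combine every key matching 'sales_*' in parts.h5 into a single 'sales_' key,
# deleting the part keys afterwards.
aggregate('parts.h5', 'sales_*', remove_part_files=True)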
def from_one_to_three(table, entity):
    return [
        name for name, column in model.column_by_name.items()
        if name in table.columns and column.entity == entity
    ]

# We could build the index here while we're at it? It would run a bit
# faster, but above all it would be more "essential".
for year in available_years:
    print('start of year %s' % year)
    table_in_one = store.select('survey_' + str(year))
    # Delete some people from every table according to test_ident.py results.
    print(len(table_in_one))
    table_in_one = table_in_one[
        ~table_in_one['idfam'].isin([700986003, 700202209, 700150006,
                                     700165702, 701609502, 801132105,
                                     802846205, 800571404, 901461205,
                                     800199302, 802008401, 800422201,
                                     802738601, 903972102, 901676301,
                                     900817401])]
    table_in_one = table_in_one[
        ~table_in_one['idmen'].isin([8009658, 9046607, 8020084, 8001993,
                                     8004222, 8027386, 9039721, 9047848,
                                     9016763])]
    print(len(table_in_one))
    for entity in ['ind', 'foy', 'men', 'fam']:
        key = 'survey_' + str(year) + '/' + str(entity)
        vars_entity = from_one_to_three(table_in_one, entity)
class HDFStoreDataFrame(BaseIO):

    goal_time = 0.2

    def setup(self):
        N = 25000
        index = tm.makeStringIndex(N)
        self.df = DataFrame({'float1': np.random.randn(N),
                             'float2': np.random.randn(N)},
                            index=index)
        self.df_mixed = DataFrame({'float1': np.random.randn(N),
                                   'float2': np.random.randn(N),
                                   'string1': ['foo'] * N,
                                   'bool1': [True] * N,
                                   'int1': np.random.randint(0, N, size=N)},
                                  index=index)
        self.df_wide = DataFrame(np.random.randn(N, 100))
        self.start_wide = self.df_wide.index[10000]
        self.stop_wide = self.df_wide.index[15000]
        self.df2 = DataFrame({'float1': np.random.randn(N),
                              'float2': np.random.randn(N)},
                             index=date_range('1/1/2000', periods=N))
        self.start = self.df2.index[10000]
        self.stop = self.df2.index[15000]
        self.df_wide2 = DataFrame(np.random.randn(N, 100),
                                  index=date_range('1/1/2000', periods=N))
        self.df_dc = DataFrame(np.random.randn(N, 10),
                               columns=['C%03d' % i for i in range(10)])

        self.fname = '__test__.h5'
        self.store = HDFStore(self.fname)
        self.store.put('fixed', self.df)
        self.store.put('fixed_mixed', self.df_mixed)
        self.store.append('table', self.df2)
        self.store.append('table_mixed', self.df_mixed)
        self.store.append('table_wide', self.df_wide)
        self.store.append('table_wide2', self.df_wide2)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store(self):
        self.store.get('fixed')

    def time_read_store_mixed(self):
        self.store.get('fixed_mixed')

    def time_write_store(self):
        self.store.put('fixed_write', self.df)

    def time_write_store_mixed(self):
        self.store.put('fixed_mixed_write', self.df_mixed)

    def time_read_store_table_mixed(self):
        self.store.select('table_mixed')

    def time_write_store_table_mixed(self):
        self.store.append('table_mixed_write', self.df_mixed)

    def time_read_store_table(self):
        self.store.select('table')

    def time_write_store_table(self):
        self.store.append('table_write', self.df)

    def time_read_store_table_wide(self):
        self.store.select('table_wide')

    def time_write_store_table_wide(self):
        self.store.append('table_wide_write', self.df_wide)

    def time_write_store_table_dc(self):
        self.store.append('table_dc_write', self.df_dc, data_columns=True)

    def time_query_store_table_wide(self):
        self.store.select('table_wide', where="index > self.start_wide and "
                                              "index < self.stop_wide")

    def time_query_store_table(self):
        self.store.select('table', where="index > self.start and "
                                         "index < self.stop")

    def time_store_repr(self):
        repr(self.store)

    def time_store_str(self):
        str(self.store)

    def time_store_info(self):
        self.store.info()
def test_multiple_open_close(setup_path):
    # gh-4409: open & close multiple times

    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        # single
        store = HDFStore(path)
        assert "CLOSED" not in store.info()
        assert store.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

    with ensure_clean_path(setup_path) as path:

        if pytables._table_file_open_policy_is_strict:
            # multiples
            store1 = HDFStore(path)
            msg = (
                r"The file [\S]* is already opened\. Please close it before "
                r"reopening in write mode\."
            )
            with pytest.raises(ValueError, match=msg):
                HDFStore(path)

            store1.close()
        else:

            # multiples
            store1 = HDFStore(path)
            store2 = HDFStore(path)

            assert "CLOSED" not in store1.info()
            assert "CLOSED" not in store2.info()
            assert store1.is_open
            assert store2.is_open

            store1.close()
            assert "CLOSED" in store1.info()
            assert not store1.is_open
            assert "CLOSED" not in store2.info()
            assert store2.is_open

            store2.close()
            assert "CLOSED" in store1.info()
            assert "CLOSED" in store2.info()
            assert not store1.is_open
            assert not store2.is_open

        # nested close
        store = HDFStore(path, mode="w")
        store.append("df", df)

        store2 = HDFStore(path)
        store2.append("df2", df)
        store2.close()
        assert "CLOSED" in store2.info()
        assert not store2.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

        # double closing
        store = HDFStore(path, mode="w")
        store.append("df", df)

        store2 = HDFStore(path)
        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

        store2.close()
        assert "CLOSED" in store2.info()
        assert not store2.is_open

    # ops on a closed store
    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        store = HDFStore(path)
        store.close()

        msg = r"[\S]* file is not open!"
        with pytest.raises(ClosedFileError, match=msg):
            store.keys()

        with pytest.raises(ClosedFileError, match=msg):
            "df" in store

        with pytest.raises(ClosedFileError, match=msg):
            len(store)

        with pytest.raises(ClosedFileError, match=msg):
            store["df"]

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.get("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.append("df2", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.put("df3", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.get_storer("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.remove("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        msg = "'HDFStore' object has no attribute 'df'"
        with pytest.raises(AttributeError, match=msg):
            store.df
def test_frame_select_complex2(setup_path):

    with ensure_clean_path(["params.hdf", "hist.hdf"]) as paths:

        pp, hh = paths

        # use non-trivial selection criteria
        params = DataFrame({"A": [1, 1, 2, 2, 3]})
        params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])

        selection = read_hdf(pp, "df", where="A=[2,3]")
        hist = DataFrame(
            np.random.randn(25, 1),
            columns=["data"],
            index=MultiIndex.from_tuples(
                [(i, j) for i in range(5) for j in range(5)],
                names=["l1", "l2"],
            ),
        )

        hist.to_hdf(hh, "df", mode="w", format="table")

        expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")

        # scope with list like
        l0 = selection.index.tolist()  # noqa:F841
        store = HDFStore(hh)
        result = store.select("df", where="l1=l0")
        tm.assert_frame_equal(result, expected)
        store.close()

        result = read_hdf(hh, "df", where="l1=l0")
        tm.assert_frame_equal(result, expected)

        # index
        index = selection.index  # noqa:F841
        result = read_hdf(hh, "df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = read_hdf(hh, "df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        # scope with index
        store = HDFStore(hh)

        result = store.select("df", where="l1=index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=selection.index.tolist()")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", where="l1=list(selection.index)")
        tm.assert_frame_equal(result, expected)

        store.close()
class HDFStoreDataFrame(BaseIO):
    def setup(self):
        N = 25000
        index = tm.makeStringIndex(N)
        self.df = DataFrame(
            {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
        )
        self.df_mixed = DataFrame(
            {
                "float1": np.random.randn(N),
                "float2": np.random.randn(N),
                "string1": ["foo"] * N,
                "bool1": [True] * N,
                "int1": np.random.randint(0, N, size=N),
            },
            index=index,
        )
        self.df_wide = DataFrame(np.random.randn(N, 100))
        self.start_wide = self.df_wide.index[10000]
        self.stop_wide = self.df_wide.index[15000]
        self.df2 = DataFrame(
            {"float1": np.random.randn(N), "float2": np.random.randn(N)},
            index=date_range("1/1/2000", periods=N),
        )
        self.start = self.df2.index[10000]
        self.stop = self.df2.index[15000]
        self.df_wide2 = DataFrame(
            np.random.randn(N, 100), index=date_range("1/1/2000", periods=N)
        )
        self.df_dc = DataFrame(
            np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)]
        )

        self.fname = "__test__.h5"

        self.store = HDFStore(self.fname)
        self.store.put("fixed", self.df)
        self.store.put("fixed_mixed", self.df_mixed)
        self.store.append("table", self.df2)
        self.store.append("table_mixed", self.df_mixed)
        self.store.append("table_wide", self.df_wide)
        self.store.append("table_wide2", self.df_wide2)

    def teardown(self):
        self.store.close()
        self.remove(self.fname)

    def time_read_store(self):
        self.store.get("fixed")

    def time_read_store_mixed(self):
        self.store.get("fixed_mixed")

    def time_write_store(self):
        self.store.put("fixed_write", self.df)

    def time_write_store_mixed(self):
        self.store.put("fixed_mixed_write", self.df_mixed)

    def time_read_store_table_mixed(self):
        self.store.select("table_mixed")

    def time_write_store_table_mixed(self):
        self.store.append("table_mixed_write", self.df_mixed)

    def time_read_store_table(self):
        self.store.select("table")

    def time_write_store_table(self):
        self.store.append("table_write", self.df)

    def time_read_store_table_wide(self):
        self.store.select("table_wide")

    def time_write_store_table_wide(self):
        self.store.append("table_wide_write", self.df_wide)

    def time_write_store_table_dc(self):
        self.store.append("table_dc_write", self.df_dc, data_columns=True)

    def time_query_store_table_wide(self):
        self.store.select(
            "table_wide", where="index > self.start_wide and index < self.stop_wide"
        )

    def time_query_store_table(self):
        self.store.select("table", where="index > self.start and index < self.stop")

    def time_store_repr(self):
        repr(self.store)

    def time_store_str(self):
        str(self.store)

    def time_store_info(self):
        self.store.info()
#!/usr/bin/env python
import pandas as pd
from pandas import HDFStore, bdate_range
from pandas.tseries.offsets import BDay

crsp = HDFStore('/home/chad/WrdsData/hdf/crsp/crsp.h5')
famafrench = HDFStore('/home/chad/WrdsData/hdf/famafrench/famafrench.h5')
DAILY_FACTORS = famafrench.select('/famafrench/factors_daily')


class Event(object):

    def __init__(self, id, evt_date, gap=5, est_period=252, frequency='B',
                 evt_start=-2, evt_end=2):
        self._id = id
        self.evt_date = pd.to_datetime(evt_date)
        self.frequency = frequency
        self._has_data = self._has_models = False
        # Use the parsed timestamp so string dates work in the offset arithmetic.
        self.evt_window = bdate_range(start=self.evt_date - BDay(abs(evt_start)),
                                      end=self.evt_date + BDay(evt_end))
        self.est_period = bdate_range(end=self.evt_date - BDay(abs(evt_start - gap)),
                                      periods=est_period)

    def run_study(self):
        ...
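The window arithmetic in __init__ above can be checked in isolation. A minimal sketch with illustrative dates and the default parameters, using the same bdate_range/BDay calls:

import pandas as pd
from pandas import bdate_range
from pandas.tseries.offsets import BDay

evt_date = pd.Timestamp('2020-06-10')  # illustrative event date
evt_start, evt_end, gap, est_period = -2, 2, 5, 252

# 5-business-day event window centred on the event date.
evt_window = bdate_range(start=evt_date - BDay(abs(evt_start)),
                         end=evt_date + BDay(evt_end))

# 252-business-day estimation period ending `gap` days before the window opens.
est_window = bdate_range(end=evt_date - BDay(abs(evt_start - gap)),
                         periods=est_period)

print(evt_window)
print(est_window)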
def glm_predictions(data: pd.HDFStore, output: pd.HDFStore,
                    model_save_dir=None, predict_train=True,
                    from_saved_model=False):  # +-
    test_set_stores = data.select_column('test', 'Store').unique()
    ##
    if from_saved_model:
        if from_saved_model is True:
            glm = get_saved_glm_model(model_save_dir)
        else:
            glm = get_saved_glm_model(from_saved_model)
    else:
        ##
        logger.info("Dropping store data before changepoint.")
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        logger.debug("Log transform on sales data")
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        select_idx = remove_outliers_lm(data, select_idx, log_lm_features,
                                        test_set_stores)
        logger.info("Removed outliers, reduced shape {0}".format(len(select_idx)))
        ##
        logger.info("Running glm training")
        X = DataFromHDF(data_store=data, key='train', select_idx=select_idx,
                        columns=linear_features)
        y = DataFromHDF(data_store=data, key='train_logsales',
                        select_idx=select_idx, column='Sales')
        glm = GLMPredictions(stores=test_set_stores, steps=15, step_by=3)
        glm.fit(X, y)
        ##
        if model_save_dir:
            glm.save_model(model_save_dir)
    ##
    logger.info("glm predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=linear_features)
    glm_output = DataFromHDF(data_store=output, key='test/glm',
                             data_columns=True)
    preds = glm.predict(X)
    glm_output.put(preds)
    ##
    if predict_train:
        logger.info("glm predictions on training set")
        X = DataFromHDF(data_store=data, key='train', columns=linear_features)
        glm_output = DataFromHDF(data_store=output, key='train/glm',
                                 data_columns=True)
        preds = glm.predict(X)
        glm_output.put(preds)
def xgb_predictions(data: pd.HDFStore, output: pd.HDFStore,
                    model_save_dir=None, predict_train=True,
                    from_saved_model=False):  # +-
    ##
    # noinspection PyUnusedLocal
    test_set_stores = data.select_column('test', 'Store').unique()
    if from_saved_model:
        if from_saved_model is True:
            xgb = get_saved_xgb_model(model_save_dir)
        else:
            xgb = get_saved_xgb_model(from_saved_model)
    else:
        logger.info("Dropping store data before changepoint.")
        select_idx = remove_before_changepoint(data, None)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        logger.info("Dropping stores not in test set. Initial shape")
        idx = data.select_as_coordinates('train', 'Store in test_set_stores')
        select_idx = select_idx.intersection(idx)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        logger.debug("Log transform on sales data")
        idx = data.select_as_coordinates('train', 'Sales > 0')
        select_idx = select_idx.intersection(idx)
        with warnings_to_log('divide by zero'):
            data.put('train_logsales',
                     np.log(data.select('train', 'columns = Sales')),
                     data_columns=True)
        logger.info("Reduced to {0}".format(len(select_idx)))
        ##
        logger.info("Running xgboost training")
        X = DataFromHDF(data_store=data, key='train', select_idx=select_idx,
                        columns=xgb_features)
        y = DataFromHDF(data_store=data, key='train_logsales',
                        select_idx=select_idx, column='Sales')
        xgb = XGBPredictions(eval_function=xgb_expm1_rmspe, params=xparams,
                             nrounds=3000)
        xgb.fit(X, y)
        ##
        if model_save_dir:
            xgb.save_model(model_save_dir)
    ##
    logger.info("xgboost predictions on test set")
    X = DataFromHDF(data_store=data, key='test', columns=xgb_features)
    xgb_output = DataFromHDF(data_store=output, key='test/xgb',
                             data_columns=True)
    preds = xgb.predict(X)
    xgb_output.put(preds)
    ##
    if predict_train:
        logger.info("xgboost predictions on training set")
        xgb_output = DataFromHDF(data_store=output, key='train/xgb',
                                 data_columns=True)
        select_idx = data.select_as_coordinates('train',
                                                'Store in test_set_stores')
        X = DataFromHDF(data_store=data, key='train', select_idx=select_idx,
                        columns=xgb_features)
        predict_in_chunks(xgb, X, xgb_output)
store.put('df', df, data_columns=True, format='table')
print(df)
store.close()
# store['df']  # load it

# Read hdf5 by chunks
# https://towardsdatascience.com/why-and-how-to-use-pandas-with-large-data-9594dda2ea4c
# https://stackoverflow.com/questions/40348945/reading-data-by-chunking-with-hdf5-and-pandas
rd_store = HDFStore(fn)

# df = pd.DataFrame(columns=columns)
# chunksize = 4096
# %%timeit
# for chunk in pd.read_hdf(fn, 'df', chunksize=chunksize, where='h_m < 5.3'):
#     df = pd.concat([df, chunk], ignore_index=True)

# Select by time
# https://stackoverflow.com/questions/25681308/pandas-read-hdf-query-by-date-and-time-range
# Maybe it would be better not to store this as a table if selecting by time.
c = rd_store.select_column('df', 'timeticket')
where = pd.DatetimeIndex(c).indexer_between_time('12:00', '16:56')
resp = rd_store.select('df', where=where)
# print(resp.info())
print(resp)

# Another selection
# https://stackoverflow.com/questions/20502996/use-or-in-hdfstore-select-pandas
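A self-contained sketch of the chunked-read pattern referenced in the comments above, assuming a table-format store with a data column named h_m; the file name and column are illustrative, not taken from the snippet's actual data:

import numpy as np
import pandas as pd

fn = 'chunk_demo.h5'  # illustrative file name
df = pd.DataFrame({'h_m': np.random.uniform(0, 10, 100_000)})
# format='table' plus data_columns=True enables where-clause queries.
df.to_hdf(fn, 'df', format='table', data_columns=True, mode='w')

# Stream matching rows in fixed-size chunks instead of loading everything.
parts = []
for chunk in pd.read_hdf(fn, 'df', chunksize=4096, where='h_m < 5.3'):
    parts.append(chunk)
result = pd.concat(parts, ignore_index=True)
print(len(result))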