class TestHDFWrapper(unittest.TestCase):
    """Tests for HDFHandler: per-month HDF5 store creation, key
    sanitization, reading/writing frames, and aggregation via ``apply``.

    ``setUp`` creates real files under ``./test_files/panel`` and
    ``tearDown`` removes them, so each test runs against a fresh layout.
    """

    def setUp(self):
        self.fdir = os.path.join('.', 'test_files', 'panel')
        months = ['1994_01', '1994_02', '1994_03']
        frequency = 'monthly'
        self.handler = HDFHandler('./test_files', 'panel', months, frequency)

    def test_file_creation(self):
        # Constructing the handler in setUp should have created one
        # ``m<YYYY_MM>.h5`` file per month.
        # _ = self.handler._select_stores(self.handler)
        expected = [os.path.join('test_files', 'panel', x)
                    for x in ('m1994_01.h5', 'm1994_02.h5', 'm1994_03.h5')]
        print(os.listdir('test_files'))
        assert all([os.path.exists(x) for x in expected])

    def test_create_from_list(self):
        # months given as a list of lists -> one store per chunk.
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        handler = HDFHandler('./test_files', kind='panel', months=months,
                             frequency=frequency)
        # BUGFIX: under Python 3 ``dict.keys()`` is a view, which never
        # compares equal to a list; materialize it first.
        self.assertEqual(list(handler.stores.keys()), ['long_1994_Q1'])
        handler.close()

    def test_getitem(self):
        # ``handler[key]`` is sugar for ``handler.stores[key]``.
        result = self.handler['m1994_01']
        expected = self.handler.stores['m1994_01']
        self.assertIs(result, expected)

    def test_write(self):
        df = pd.DataFrame({'A': [1, 2, 3]})
        self.handler.write(df, 'm1994_01', format='f', append=False)
        res = self.handler.select('m1994_01')
        tm.assert_frame_equal(df, res)

    def test_iter(self):
        # Iterating the handler yields store keys in month order.
        result = [x for x in self.handler]
        expected = ['m1994_01', 'm1994_02', 'm1994_03']
        self.assertEqual(result, expected)

    # def test_select_all(self):
    #     import ipdb; ipdb.set_trace()
    #     h = HDFHandler(self.settings, 'panel', frequency='monthly')
    #     assert len(h.stores) == 3

    def test_sanitize_key(self):
        # '1994-01', '1994_01', and 'm1994-01' all normalize to 'm1994_01'.
        result = self.handler._sanitize_key('1994-01')
        expected = 'm1994_01'
        self.assertEqual(result, expected)
        result = self.handler._sanitize_key('1994_01')
        self.assertEqual(result, expected)
        result = self.handler._sanitize_key('m1994-01')
        self.assertEqual(result, expected)
        # With a different prefix (kept for the commented scenario below).
        settings = {'base_path': './test_files/'}
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        # handler = HDFHandler(settings, kind='long', months=months,
        #                      frequency=frequency)
        # result = handler._sanitize_key('long_1996_Q1')
        # print(handler.pre)
        # expected = "long_1996_Q1"
        # self.assertEqual(result, expected)

    def test_iteritems(self):
        # Stores start out empty, so the first value is None.
        g = self.handler.iteritems()
        name, value = next(g)
        self.assertEqual(name, 'm1994_01')
        self.assertIs(value, None)

    # getting a `is not a regular file` error.
    def test_from_directory(self):
        # setUp should have created some; build a fresh dir with one .h5.
        os.mkdir('from_dir')
        with open(os.path.join('from_dir', 'file5.h5'), 'w'):
            pass
        try:
            handler = HDFHandler.from_directory('from_dir', kind='M')
            self.assertEqual(len(handler.stores), 1)
            handler.close()
        except Exception as e:
            print(e)
        finally:
            # Always remove the scratch directory, pass or fail.
            os.remove(os.path.join('from_dir', 'file5.h5'))
            os.removedirs('from_dir')

    # def test_map(self):
    #     res = self.handler.map(sum)

    def test_apply(self):
        # agg, groupby=None, level='stamp'
        _write_df_to_handler(self)
        result = self.handler.apply('mean', level='stamp', selector='A')
        expected = pd.Series([2.5, 2.5, 2.5], name='A',
                             index=pd.to_datetime(['1994-01-01', '1994-02-01',
                                                   '1994-03-01']))
        tm.assert_series_equal(result, expected)
        # agg, groupby=None, level='stamp', list selector
        result = self.handler.apply('mean', level='stamp', selector=['A'])
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)
        # with select kwargs
        result = self.handler.apply('mean', level='stamp', selector=['A'],
                                    select_kwargs={'columns': ['A']})
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)
        # agg, groupby='B', level='None'
        # will fail
        # result = self.handler.apply('mean', groupby='B', selector=['A'])
        # expected = pd.Series([1.5, 3.5], name='A', index=['a', 'b'])
        # tm.assert_series_equal(result, expected)
        # a list of aggs -> MultiIndex columns (selector, agg)
        result = self.handler.apply(['mean', 'count'], level='stamp',
                                    selector=['A'])
        expected = pd.DataFrame([[2.5, 4], [2.5, 4], [2.5, 4]],
                                index=pd.to_datetime(['1994-01-01',
                                                      '1994-02-01',
                                                      '1994-03-01']))
        expected.index.name = 'stamp'
        expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                      ('A', 'count')])
        tm.assert_frame_equal(result, expected)

    def test_select(self):
        df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
        self.handler.write(df, 'm1994_01', format='table', append=False)
        # actual test
        result = self.handler.select('m1994_01')
        tm.assert_frame_equal(result, df)
        result = self.handler.select('m1994_01', columns=['A'])
        tm.assert_frame_equal(result, df[['A']])

    def test_select_all(self):
        df = _write_df_to_handler(self, return_df=True).sort_index()
        # actual test
        result = self.handler.select_all()
        tm.assert_frame_equal(result, df)

    def tearDown(self):
        self.handler.close()

        def unlink(subdir):
            # Remove every file under test_files/<subdir>, then the dir.
            for f in os.listdir(os.path.join('test_files', subdir)):
                file_path = os.path.join('.', 'test_files', subdir, f)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except OSError:
                    # Best-effort cleanup; a vanished file is fine.
                    pass
                except Exception as e:
                    print(e)
            os.rmdir(os.path.join('test_files', subdir))

        unlink('panel')
        # unlink('long')
        os.rmdir('test_files')
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Convert the monthly panel stores in ``panel_h`` into quarterly
    "long"-format HDF stores, attaching real-wage columns along the way.

    Parameters
    ----------
    panel_h : HDFHandler
        Handler whose ``.stores`` hold the monthly panels to read.
    settings : dict
        NOTE(review): this argument is effectively ignored — it is
        unconditionally overwritten by re-reading
        '../panel_construction/settings.txt' below. Confirm whether any
        caller relies on its own settings being used.
    start, stop : str or None
        Optional first/last store keys; default to the sorted extremes
        of ``panel_h.stores.keys()``.

    Side effects
    ------------
    Writes one 'long' and one 'earn' HDF store per quarter chunk, then
    writes a final cleaned frame to a hard-coded local path
    ('/Volumes/HDD/.../clean.h5'). Closes every store it opens.
    """
    # Need BLS compensation for the real-wage calculation below.
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)  # shadows the `settings` parameter — see note

    analyzed = pd.HDFStore(settings['analyzed_path'])
    comp = analyzed.select('bls_productivity_compensation')['compensation']
    prod = analyzed.select('bls_productivity_compensation')['productivity']
    # Store keys look like 'mYYYY_MM' (see the strftime filter below).
    keys = sorted(panel_h.stores.keys())
    m0 = start or keys[0]
    m0 = date_parser(m0)
    mn = stop or keys[-1]
    mn = date_parser(mn)

    # Keep only months that actually have a store in panel_h.
    months = [x.strftime('%Y_%m') for x in arrow.Arrow.range('month', m0, mn)
              if x.strftime('m%Y_%m') in keys]
    # Getting some memory pressure: break into chunks, write each out,
    # then read the processed chunks back.

    # Chunking by quarter (3 months per chunk); drop empty chunks.
    month_chunks = chunk_quarters(months, 3)
    month_chunks = [x for x in month_chunks if len(x) > 0]
    p = pathlib.Path(str(settings['base_path']))
    out_store = HDFHandler(str(p), kind='long', months=month_chunks,
                           frequency='Q')
    earn_store = HDFHandler(str(p), kind='earn', months=month_chunks,
                            frequency='Q')
    for chunk in month_chunks:
        # Need the three-month chunks... maybe zip up with out_store;
        # may need another dict.
        df = read_to_long(panel_h, chunk)
        name = make_chunk_name(chunk)
        # out_store.write(df, name, format='table', append=False)
        s = out_store.stores[name]

        # Add in real hourly wage: compensation index, forward-filled to
        # the panel's stamps, rescaled from percent.
        c = comp.reindex(df.index, level='stamp').fillna(method='ffill') / 100
        # Adjust weight decimals (weights stored with 4 implied decimals).
        df.loc[:, 'og_weight'] = df['og_weight'] / 10000
        # CPS reports earnings in cents; convert to dollars.
        df.loc[:, 'earnings'] = df['earnings'] / 100
        df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
        # inf comes from division by zero hours; treat as missing.
        df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)
        df = replace_categorical(df, kind='flow', inverse=True)
        with pd.get_store(s.filename) as store:
            df.to_hdf(store, name, format='table', append=False)

        # ----------------------------------------------------------------
        # Also write out just earnings (NaN issues mean we can't select
        # them later); real hourly earnings must exist first.
        earn = df[~pd.isnull(df.real_hr_earns)]
        earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
        s = earn_store.stores[name]
        with pd.get_store(s.filename) as store:
            earn.to_hdf(store, name, format='table', append=False,
                        data_columns=True)
        print("Finished " + str(chunk))

    # Finally, quarterize the combined earnings and write the clean file.
    df = earn_store.select_all().drop(['occupation', 'actual_hours'], axis=1)
    df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
    df = quarterize(df)
    df['productivity'] = prod.reindex(df.index, level='qmonth')
    df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)
    # df = add_demo_dummies(df)
    # model, res = construct_wage_index(df)
    # df.loc[:, 'wage_index_res'] = res.resid
    cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'
    with pd.get_store(cln_path) as store:
        df.to_hdf(store, 'cleaned', format='f', append=False)
    out_store.close()
    analyzed.close()
    earn_store.close()
class TestHDFWrapper(unittest.TestCase):
    """Tests for HDFHandler: per-month HDF5 store creation, key
    sanitization, reading/writing frames, and aggregation via ``apply``.

    ``setUp`` creates real files under ``./test_files/panel`` and
    ``tearDown`` removes them, so each test runs against a fresh layout.
    """

    def setUp(self):
        self.fdir = os.path.join('.', 'test_files', 'panel')
        months = ['1994_01', '1994_02', '1994_03']
        frequency = 'monthly'
        self.handler = HDFHandler('./test_files', 'panel', months, frequency)

    def test_file_creation(self):
        # Constructing the handler in setUp should have created one
        # ``m<YYYY_MM>.h5`` file per month.
        # _ = self.handler._select_stores(self.handler)
        expected = [os.path.join('test_files', 'panel', x)
                    for x in ('m1994_01.h5', 'm1994_02.h5', 'm1994_03.h5')]
        print(os.listdir('test_files'))
        assert all([os.path.exists(x) for x in expected])

    def test_create_from_list(self):
        # months given as a list of lists -> one store per chunk.
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        handler = HDFHandler('./test_files', kind='panel', months=months,
                             frequency=frequency)
        # BUGFIX: under Python 3 ``dict.keys()`` is a view, which never
        # compares equal to a list; materialize it first.
        self.assertEqual(list(handler.stores.keys()), ['long_1994_Q1'])
        handler.close()

    def test_getitem(self):
        # ``handler[key]`` is sugar for ``handler.stores[key]``.
        result = self.handler['m1994_01']
        expected = self.handler.stores['m1994_01']
        self.assertIs(result, expected)

    def test_write(self):
        df = pd.DataFrame({'A': [1, 2, 3]})
        self.handler.write(df, 'm1994_01', format='f', append=False)
        res = self.handler.select('m1994_01')
        tm.assert_frame_equal(df, res)

    def test_iter(self):
        # Iterating the handler yields store keys in month order.
        result = [x for x in self.handler]
        expected = ['m1994_01', 'm1994_02', 'm1994_03']
        self.assertEqual(result, expected)

    # def test_select_all(self):
    #     import ipdb; ipdb.set_trace()
    #     h = HDFHandler(self.settings, 'panel', frequency='monthly')
    #     assert len(h.stores) == 3

    def test_sanitize_key(self):
        # '1994-01', '1994_01', and 'm1994-01' all normalize to 'm1994_01'.
        result = self.handler._sanitize_key('1994-01')
        expected = 'm1994_01'
        self.assertEqual(result, expected)
        result = self.handler._sanitize_key('1994_01')
        self.assertEqual(result, expected)
        result = self.handler._sanitize_key('m1994-01')
        self.assertEqual(result, expected)
        # With a different prefix (kept for the commented scenario below).
        settings = {'base_path': './test_files/'}
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        # handler = HDFHandler(settings, kind='long', months=months,
        #                      frequency=frequency)
        # result = handler._sanitize_key('long_1996_Q1')
        # print(handler.pre)
        # expected = "long_1996_Q1"
        # self.assertEqual(result, expected)

    def test_iteritems(self):
        # Stores start out empty, so the first value is None.
        g = self.handler.iteritems()
        name, value = next(g)
        self.assertEqual(name, 'm1994_01')
        self.assertIs(value, None)

    # getting a `is not a regular file` error.
    def test_from_directory(self):
        # setUp should have created some; build a fresh dir with one .h5.
        os.mkdir('from_dir')
        with open(os.path.join('from_dir', 'file5.h5'), 'w'):
            pass
        try:
            handler = HDFHandler.from_directory('from_dir', kind='M')
            self.assertEqual(len(handler.stores), 1)
            handler.close()
        except Exception as e:
            print(e)
        finally:
            # Always remove the scratch directory, pass or fail.
            os.remove(os.path.join('from_dir', 'file5.h5'))
            os.removedirs('from_dir')

    # def test_map(self):
    #     res = self.handler.map(sum)

    def test_apply(self):
        # agg, groupby=None, level='stamp'
        _write_df_to_handler(self)
        result = self.handler.apply('mean', level='stamp', selector='A')
        expected = pd.Series([2.5, 2.5, 2.5], name='A',
                             index=pd.to_datetime(['1994-01-01', '1994-02-01',
                                                   '1994-03-01']))
        tm.assert_series_equal(result, expected)
        # agg, groupby=None, level='stamp', list selector
        result = self.handler.apply('mean', level='stamp', selector=['A'])
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)
        # with select kwargs
        result = self.handler.apply('mean', level='stamp', selector=['A'],
                                    select_kwargs={'columns': ['A']})
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)
        # agg, groupby='B', level='None'
        # will fail
        # result = self.handler.apply('mean', groupby='B', selector=['A'])
        # expected = pd.Series([1.5, 3.5], name='A', index=['a', 'b'])
        # tm.assert_series_equal(result, expected)
        # a list of aggs -> MultiIndex columns (selector, agg)
        result = self.handler.apply(['mean', 'count'], level='stamp',
                                    selector=['A'])
        expected = pd.DataFrame([[2.5, 4], [2.5, 4], [2.5, 4]],
                                index=pd.to_datetime(['1994-01-01',
                                                      '1994-02-01',
                                                      '1994-03-01']))
        expected.index.name = 'stamp'
        expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                      ('A', 'count')])
        tm.assert_frame_equal(result, expected)

    def test_select(self):
        df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
        self.handler.write(df, 'm1994_01', format='table', append=False)
        # actual test
        result = self.handler.select('m1994_01')
        tm.assert_frame_equal(result, df)
        result = self.handler.select('m1994_01', columns=['A'])
        tm.assert_frame_equal(result, df[['A']])

    def test_select_all(self):
        df = _write_df_to_handler(self, return_df=True).sort_index()
        # actual test
        result = self.handler.select_all()
        tm.assert_frame_equal(result, df)

    def tearDown(self):
        self.handler.close()

        def unlink(subdir):
            # Remove every file under test_files/<subdir>, then the dir.
            for f in os.listdir(os.path.join('test_files', subdir)):
                file_path = os.path.join('.', 'test_files', subdir, f)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except OSError:
                    # Best-effort cleanup; a vanished file is fine.
                    pass
                except Exception as e:
                    print(e)
            os.rmdir(os.path.join('test_files', subdir))

        unlink('panel')
        # unlink('long')
        os.rmdir('test_files')
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Convert the monthly panel stores in ``panel_h`` into quarterly
    "long"-format HDF stores, attaching real-wage columns along the way.

    Parameters
    ----------
    panel_h : HDFHandler
        Handler whose ``.stores`` hold the monthly panels to read.
    settings : dict
        NOTE(review): this argument is effectively ignored — it is
        unconditionally overwritten by re-reading
        '../panel_construction/settings.txt' below. Confirm whether any
        caller relies on its own settings being used.
    start, stop : str or None
        Optional first/last store keys; default to the sorted extremes
        of ``panel_h.stores.keys()``.

    Side effects
    ------------
    Writes one 'long' and one 'earn' HDF store per quarter chunk, then
    writes a final cleaned frame to a hard-coded local path
    ('/Volumes/HDD/.../clean.h5'). Closes every store it opens.
    """
    # Need BLS compensation for the real-wage calculation below.
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)  # shadows the `settings` parameter — see note

    analyzed = pd.HDFStore(settings['analyzed_path'])
    comp = analyzed.select('bls_productivity_compensation')['compensation']
    prod = analyzed.select('bls_productivity_compensation')['productivity']
    # Store keys look like 'mYYYY_MM' (see the strftime filter below).
    keys = sorted(panel_h.stores.keys())
    m0 = start or keys[0]
    m0 = date_parser(m0)
    mn = stop or keys[-1]
    mn = date_parser(mn)

    # Keep only months that actually have a store in panel_h.
    months = [x.strftime('%Y_%m') for x in arrow.Arrow.range('month', m0, mn)
              if x.strftime('m%Y_%m') in keys]
    # Getting some memory pressure: break into chunks, write each out,
    # then read the processed chunks back.

    # Chunking by quarter (3 months per chunk); drop empty chunks.
    month_chunks = chunk_quarters(months, 3)
    month_chunks = [x for x in month_chunks if len(x) > 0]
    p = pathlib.Path(str(settings['base_path']))
    out_store = HDFHandler(str(p), kind='long', months=month_chunks,
                           frequency='Q')
    earn_store = HDFHandler(str(p), kind='earn', months=month_chunks,
                            frequency='Q')
    for chunk in month_chunks:
        # Need the three-month chunks... maybe zip up with out_store;
        # may need another dict.
        df = read_to_long(panel_h, chunk)
        name = make_chunk_name(chunk)
        # out_store.write(df, name, format='table', append=False)
        s = out_store.stores[name]

        # Add in real hourly wage: compensation index, forward-filled to
        # the panel's stamps, rescaled from percent.
        c = comp.reindex(df.index, level='stamp').fillna(method='ffill') / 100
        # Adjust weight decimals (weights stored with 4 implied decimals).
        df.loc[:, 'og_weight'] = df['og_weight'] / 10000
        # CPS reports earnings in cents; convert to dollars.
        df.loc[:, 'earnings'] = df['earnings'] / 100
        df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
        # inf comes from division by zero hours; treat as missing.
        df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)
        df = replace_categorical(df, kind='flow', inverse=True)
        with pd.get_store(s.filename) as store:
            df.to_hdf(store, name, format='table', append=False)

        # ----------------------------------------------------------------
        # Also write out just earnings (NaN issues mean we can't select
        # them later); real hourly earnings must exist first.
        earn = df[~pd.isnull(df.real_hr_earns)]
        earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
        s = earn_store.stores[name]
        with pd.get_store(s.filename) as store:
            earn.to_hdf(store, name, format='table', append=False,
                        data_columns=True)
        print("Finished " + str(chunk))

    # Finally, quarterize the combined earnings and write the clean file.
    df = earn_store.select_all().drop(['occupation', 'actual_hours'], axis=1)
    df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
    df = quarterize(df)
    df['productivity'] = prod.reindex(df.index, level='qmonth')
    df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)
    # df = add_demo_dummies(df)
    # model, res = construct_wage_index(df)
    # df.loc[:, 'wage_index_res'] = res.resid
    cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'
    with pd.get_store(cln_path) as store:
        df.to_hdf(store, 'cleaned', format='f', append=False)
    out_store.close()
    analyzed.close()
    earn_store.close()