Example #1
class TestHDFWrapper(unittest.TestCase):
    def setUp(self):
        self.fdir = os.path.join('.', 'test_files', 'panel')
        months = ['1994_01', '1994_02', '1994_03']
        frequency = 'monthly'
        self.handler = HDFHandler('./test_files', 'panel', months, frequency)

    def test_file_creation(self):
        # _ = self.handler._select_stores(self.handler)
        expected = [
            os.path.join('test_files', 'panel', x)
            for x in ('m1994_01.h5', 'm1994_02.h5', 'm1994_03.h5')
        ]
        print(os.listdir('test_files'))

        self.assertTrue(all(os.path.exists(x) for x in expected))

    def test_create_from_list(self):
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        handler = HDFHandler('./test_files',
                             kind='panel',
                             months=months,
                             frequency=frequency)
        self.assertEqual(list(handler.stores.keys()), ['long_1994_Q1'])
        handler.close()

    def test_getitem(self):
        result = self.handler['m1994_01']
        expected = self.handler.stores['m1994_01']
        self.assertIs(result, expected)

    def test_write(self):
        df = pd.DataFrame({'A': [1, 2, 3]})
        self.handler.write(df, 'm1994_01', format='f', append=False)
        res = self.handler.select('m1994_01')
        tm.assert_frame_equal(df, res)

    def test_iter(self):
        result = [x for x in self.handler]
        expected = ['m1994_01', 'm1994_02', 'm1994_03']
        self.assertEqual(result, expected)

    # def test_select_all(self):
    #     import ipdb; ipdb.set_trace()
    #     h = HDFHandler(self.settings, 'panel', frequency='monthly')
    #     assert len(h.stores) == 3

    def test_sanitize_key(self):
        result = self.handler._sanitize_key('1994-01')
        expected = 'm1994_01'
        self.assertEqual(result, expected)

        result = self.handler._sanitize_key('1994_01')
        self.assertEqual(result, expected)

        result = self.handler._sanitize_key('m1994-01')
        self.assertEqual(result, expected)

        # With a different prefix
        settings = {'base_path': './test_files/'}
        months = [['1994_01', '1994_02', '1994_03']]
        frequency = 'Q'
        # handler = HDFHandler(settings, kind='long', months=months,
        #                      frequency=frequency)
        # result = handler._sanitize_key('long_1996_Q1')
        # print(handler.pre)
        # expected = "long_1996_Q1"
        # self.assertEqual(result, expected)

    def test_iteritems(self):
        g = self.handler.iteritems()
        name, value = next(g)
        self.assertEqual(name, 'm1994_01')
        self.assertIs(value, None)

    # Getting an `is not a regular file` error.
    def test_from_directory(self):
        # setup should have created some.
        os.mkdir('from_dir')
        with open(os.path.join('from_dir', 'file5.h5'), 'w'):
            pass
        try:
            handler = HDFHandler.from_directory('from_dir', kind='M')
            self.assertEqual(len(handler.stores), 1)
            handler.close()
        except Exception as e:
            print(e)
        finally:
            os.remove(os.path.join('from_dir', 'file5.h5'))
            os.removedirs('from_dir')

    # def test_map(self):
    #     res = self.handler.map(sum)

    def test_apply(self):
        # agg, groupby=None, level='stamp'
        _write_df_to_handler(self)
        result = self.handler.apply('mean', level='stamp', selector='A')
        expected = pd.Series([2.5, 2.5, 2.5],
                             name='A',
                             index=pd.to_datetime(
                                 ['1994-01-01', '1994-02-01', '1994-03-01']))
        tm.assert_series_equal(result, expected)

        # agg, groupby=None, level='stamp', list selector
        result = self.handler.apply('mean', level='stamp', selector=['A'])
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)

        # with select kwargs
        result = self.handler.apply('mean',
                                    level='stamp',
                                    selector=['A'],
                                    select_kwargs={'columns': ['A']})
        expected = pd.DataFrame(expected)
        expected.index.names = ['stamp']
        tm.assert_frame_equal(result, expected)

        # agg, groupby='B', level='None'
        # will fail
        # result = self.handler.apply('mean', groupby='B', selector=['A'])
        # expected = pd.Series([1.5, 3.5], name='A', index=['a', 'b'])
        # tm.assert_series_equal(result, expected)

        # a list of aggs
        result = self.handler.apply(['mean', 'count'],
                                    level='stamp',
                                    selector=['A'])
        expected = pd.DataFrame(
            [[2.5, 4], [2.5, 4], [2.5, 4]],
            index=pd.to_datetime(['1994-01-01', '1994-02-01', '1994-03-01']))
        expected.index.name = 'stamp'
        expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
                                                      ('A', 'count')])
        tm.assert_frame_equal(result, expected)

    def test_select(self):
        df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
        self.handler.write(df, 'm1994_01', format='table', append=False)

        # actual test
        result = self.handler.select('m1994_01')
        tm.assert_frame_equal(result, df)

        result = self.handler.select('m1994_01', columns=['A'])
        tm.assert_frame_equal(result, df[['A']])

    def test_select_all(self):
        df = _write_df_to_handler(self, return_df=True).sort_index()

        # actual test
        result = self.handler.select_all()
        tm.assert_frame_equal(result, df)

    def tearDown(self):
        self.handler.close()

        def unlink(subdir):
            for f in os.listdir(os.path.join('test_files', subdir)):
                file_path = os.path.join('.', 'test_files', subdir, f)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except OSError:
                    pass
                except Exception as e:
                    print(e)

            os.rmdir(os.path.join('test_files', subdir))

        unlink('panel')
        # unlink('long')
        os.rmdir('test_files')
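
The tests above rely on a module-level helper, `_write_df_to_handler`, that is not included in this excerpt. Below is a minimal sketch of what it might look like, inferred from `test_apply` (four rows per month, column 'A' averaging 2.5) and `test_select_all`; the index name 'id' and the exact values are assumptions, not the original helper.

import pandas as pd


def _write_df_to_handler(case, return_df=False):
    # Hypothetical helper: write a small MultiIndexed frame into each monthly
    # store of ``case.handler`` and optionally return the concatenated frame.
    frames = []
    for month in ('1994-01-01', '1994-02-01', '1994-03-01'):
        stamps = pd.to_datetime([month] * 4)
        idx = pd.MultiIndex.from_arrays([stamps, list(range(4))],
                                        names=['stamp', 'id'])
        df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': list('aabb')}, index=idx)
        key = 'm' + month[:7].replace('-', '_')
        case.handler.write(df, key, format='table', append=False)
        frames.append(df)
    if return_df:
        return pd.concat(frames)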
Example #2
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Let's chunk by quarters.
    """

    # need compensation for real wage
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    analyzed = pd.HDFStore(settings['analyzed_path'])
    comp = analyzed.select('bls_productivity_compensation')['compensation']
    prod = analyzed.select('bls_productivity_compensation')['productivity']

    keys = sorted(panel_h.stores.keys())

    m0 = start or keys[0]
    m0 = date_parser(m0)

    mn = stop or keys[-1]
    mn = date_parser(mn)

    months = [
        x.strftime('%Y_%m') for x in arrow.Arrow.range('month', m0, mn)
        if x.strftime('m%Y_%m') in keys
    ]

    # Getting some memory pressure: break into chunks, write each out,
    # then read the processed chunks back.
    # Chunking by quarter.

    month_chunks = chunk_quarters(months, 3)
    month_chunks = [x for x in month_chunks if len(x) > 0]
    p = pathlib.Path(str(settings['base_path']))
    out_store = HDFHandler(str(p),
                           kind='long',
                           months=month_chunks,
                           frequency='Q')
    earn_store = HDFHandler(str(p),
                            kind='earn',
                            months=month_chunks,
                            frequency='Q')

    for chunk in month_chunks:
        # need the three-month chunks... maybe zip up with out_store.
        # may need another dict.
        df = read_to_long(panel_h, chunk)
        name = make_chunk_name(chunk)

        # out_store.write(df, name, format='table', append=False)
        s = out_store.stores[name]

        # add in real hourly wage
        c = comp.reindex(df.index, level='stamp').fillna(method='ffill') / 100

        # adjust weight decimals
        df.loc[:, 'og_weight'] = df['og_weight'] / 10000

        # CPS reports earnings in cents
        df.loc[:, 'earnings'] = df['earnings'] / 100

        df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
        df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf,
                                                          np.nan)  # div by 0

        df = replace_categorical(df, kind='flow', inverse=True)
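        # pd.get_store is the old pandas (< 0.21) way of opening an HDFStore
        # as a context manager.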
        with pd.get_store(s.filename) as store:
            df.to_hdf(store, name, format='table', append=False)

        #----------------------------------------------------------------
        # Also write out just earnings (NaN issues, so they can't be selected later);
        # real_hr_earns needs to be computed first.
        earn = df[~pd.isnull(df.real_hr_earns)]
        earn = earn[(earn.hours > 0) & (earn.earnings > 0)]

        s = earn_store.stores[name]
        with pd.get_store(s.filename) as store:
            earn.to_hdf(store,
                        name,
                        format='table',
                        append=False,
                        data_columns=True)
        print("Finished " + str(chunk))

    # finally, chunk by quarter and write out.
    df = earn_store.select_all().drop(['occupation', 'actual_hours'], axis=1)
    df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])

    df = quarterize(df)

    df['productivity'] = prod.reindex(df.index, level='qmonth')
    df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)
    # df = add_demo_dummies(df)
    # model, res = construct_wage_index(df)
    # df.loc[:, 'wage_index_res'] = res.resid

    cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

    with pd.get_store(cln_path) as store:
        df.to_hdf(store, 'cleaned', format='f', append=False)

    out_store.close()
    analyzed.close()
    earn_store.close()
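
`make_to_long` leans on two helpers from the surrounding project, `chunk_quarters` and `make_chunk_name`, that are not shown here. The sketch below only illustrates the behaviour the call sites imply: grouping the 'YYYY_MM' keys three at a time and naming each chunk after its quarter. The 'long_YYYY_QN' naming convention is an assumption based on the 'long_1994_Q1' key expected in the tests.

def chunk_quarters(months, size=3):
    # Hypothetical sketch: split a list of 'YYYY_MM' strings into
    # consecutive groups of `size` (one calendar quarter per group).
    return [months[i:i + size] for i in range(0, len(months), size)]


def make_chunk_name(chunk):
    # Hypothetical sketch: name a chunk after the quarter of its first month,
    # e.g. ['1994_01', '1994_02', '1994_03'] -> 'long_1994_Q1'.
    year, month = chunk[0].split('_')
    quarter = (int(month) - 1) // 3 + 1
    return 'long_{}_Q{}'.format(year, quarter)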