def test_replace_catagorical(self):
    """Exercise ``helpers.replace_categorical`` in three modes.

    1. Called on a whole frame of integer codes, the result must equal
       the frame of string labels.
    2. Called with ``kind=<column>``, only that column is expected to
       differ from the input.
    3. Called with ``inverse=True``, labels must map back to the codes,
       including every one of the nine ``flow`` labels.
    """
    codes = pd.DataFrame({
        'sex': [1, 2],
        'race': [1, 2],
        'married': [1, 4],
        'labor_status': [1, 2],
        'industry': [1, 3],
        'occupation': [1, 7],
        'edu': [31, 35],
        'flow': [1, 3],
        'history': [0, 1],
    })
    labels = pd.DataFrame({
        'sex': ['male', 'female'],
        'race': ['White Only', 'Black Only'],
        'married': ["MARRIED, CIVILIAN SPOUSE PRESENT", "WIDOWED"],
        'labor_status': ['employed', 'absent'],
        'industry': ["Agriculture", "Mining"],
        'occupation': ["Management", "Legal"],
        'edu': ["LESS THAN 1ST GRADE", "9TH GRADE"],
        'flow': ['ee', 'en'],
        'history': ['employed', 'not_employed'],
    })

    # full: every column handled in a single call (on a copy of the input)
    converted = helpers.replace_categorical(codes.copy())
    tm.assert_frame_equal(converted, labels)

    # one kind at a time: the other columns must come back untouched
    for column in codes.columns:
        single = helpers.replace_categorical(codes.copy(), kind=column)
        wanted = codes.copy()
        wanted[column] = labels[column]
        tm.assert_frame_equal(single, wanted)

    # inverse: labels -> codes
    round_trip = helpers.replace_categorical(labels, inverse=True)
    tm.assert_frame_equal(round_trip, codes)

    flow_labels = pd.DataFrame(
        {"flow": ['ee', 'eu', 'en', 'ue', 'uu', 'un', 'ne', 'nu', 'nn']})
    flow_codes = pd.DataFrame({"flow": list(range(1, 10))})
    flow_result = helpers.replace_categorical(flow_labels, kind='flow',
                                              inverse=True)
    tm.assert_frame_equal(flow_result, flow_codes)
def test_replace_catagorical(self):
    """Exercise ``helpers.replace_categorical`` in three modes.

    1. Called on a whole frame of integer codes, the result must equal
       the frame of string labels.
    2. Called with ``kind=<column>``, only that column is expected to
       differ from the input.
    3. Called with ``inverse=True``, labels must map back to the codes,
       including every one of the nine ``flow`` labels.
    """
    codes = pd.DataFrame({
        'sex': [1, 2],
        'race': [1, 2],
        'married': [1, 4],
        'labor_status': [1, 2],
        'industry': [1, 3],
        'occupation': [1, 7],
        'edu': [31, 35],
        'flow': [1, 3],
        'history': [0, 1],
    })
    labels = pd.DataFrame({
        'sex': ['male', 'female'],
        'race': ['White Only', 'Black Only'],
        'married': ["MARRIED, CIVILIAN SPOUSE PRESENT", "WIDOWED"],
        'labor_status': ['employed', 'absent'],
        'industry': ["Agriculture", "Mining"],
        'occupation': ["Management", "Legal"],
        'edu': ["LESS THAN 1ST GRADE", "9TH GRADE"],
        'flow': ['ee', 'en'],
        'history': ['employed', 'not_employed'],
    })

    # full: every column handled in a single call (on a copy of the input)
    converted = helpers.replace_categorical(codes.copy())
    tm.assert_frame_equal(converted, labels)

    # one kind at a time: the other columns must come back untouched
    for column in codes.columns:
        single = helpers.replace_categorical(codes.copy(), kind=column)
        wanted = codes.copy()
        wanted[column] = labels[column]
        tm.assert_frame_equal(single, wanted)

    # inverse: labels -> codes
    round_trip = helpers.replace_categorical(labels, inverse=True)
    tm.assert_frame_equal(round_trip, codes)

    flow_labels = pd.DataFrame(
        {"flow": ['ee', 'eu', 'en', 'ue', 'uu', 'un', 'ne', 'nu', 'nn']})
    flow_codes = pd.DataFrame({"flow": list(range(1, 10))})
    flow_result = helpers.replace_categorical(flow_labels, kind='flow',
                                              inverse=True)
    tm.assert_frame_equal(flow_result, flow_codes)
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Build the long-format panel quarter by quarter and write it to HDF5.

    For every quarter-sized chunk of months the function reads the long
    frame, rescales weights and earnings, derives ``real_hr_earns``,
    converts ``flow`` with ``replace_categorical(..., inverse=True)`` and
    writes two tables: the full frame (``kind='long'`` handler) and the
    subset with usable earnings (``kind='earn'`` handler).  Afterwards all
    earnings tables are combined, quarterized, joined with productivity
    and written under the key ``'cleaned'``.

    Parameters
    ----------
    panel_h : object exposing ``stores``
        ``stores`` is a mapping whose keys look like ``'mYYYY_MM'``; the
        object itself is handed to ``read_to_long``.
    settings : ignored
        NOTE(review): the argument is immediately replaced by the JSON
        loaded from ``'../panel_construction/settings.txt'``.  Kept as-is
        so existing callers see no change -- confirm whether the passed
        value should win instead.
    start, stop : optional
        First / last month, in whatever form ``date_parser`` accepts.
        Default to the first / last sorted key of ``panel_h.stores``.

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    The stores are closed in a ``finally`` clause, so an exception while
    processing a chunk no longer leaves the HDF5 files open.

    NOTE(review): ``pd.get_store`` is kept because the installed pandas
    version is not visible here; it is absent from recent pandas releases
    -- confirm the pinned version before upgrading.
    """
    # need compensation for real wage
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    analyzed = pd.HDFStore(settings['analyzed_path'])
    out_store = None
    earn_store = None
    try:
        # Read the BLS table once and take both columns from it.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        known_keys = set(keys)  # O(1) membership for the month filter

        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])

        # Keep only the months that actually have a store.
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in known_keys]

        # Getting some memory pressure: break into chunks and write each
        # one out, then read the processed chunks back.
        # Chunking by quarter; empty chunks are dropped.
        month_chunks = [x for x in chunk_quarters(months, 3) if len(x) > 0]

        p = pathlib.Path(str(settings['base_path']))
        out_store = HDFHandler(str(p), kind='long', months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(str(p), kind='earn', months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)
            s = out_store.stores[name]

            # add in real hourly wage: compensation aligned on the
            # 'stamp' index level, gaps forward-filled, scaled by 1/100
            c = comp.reindex(df.index,
                             level='stamp').fillna(method='ffill') / 100
            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000
            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100
            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf; store it as missing
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)
            df = replace_categorical(df, kind='flow', inverse=True)

            with pd.get_store(s.filename) as store:
                df.to_hdf(store, name, format='table', append=False)

            # ------------------------------------------------------------
            # Also write out just earnings (nan issues so can't select
            # later); real hourly earnings had to be computed first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
            s = earn_store.stores[name]
            with pd.get_store(s.filename) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
        df = quarterize(df)
        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        # NOTE(review): machine-specific absolute path -- consider moving
        # it into the settings file.
        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'
        with pd.get_store(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Same close order as before; handlers that were never created
        # (failure before construction) are skipped.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()
def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Build the long-format panel quarter by quarter and write it to HDF5.

    For every quarter-sized chunk of months the function reads the long
    frame, rescales weights and earnings, derives ``real_hr_earns``,
    converts ``flow`` with ``replace_categorical(..., inverse=True)`` and
    writes two tables: the full frame (``kind='long'`` handler) and the
    subset with usable earnings (``kind='earn'`` handler).  Afterwards all
    earnings tables are combined, quarterized, joined with productivity
    and written under the key ``'cleaned'``.

    Parameters
    ----------
    panel_h : object exposing ``stores``
        ``stores`` is a mapping whose keys look like ``'mYYYY_MM'``; the
        object itself is handed to ``read_to_long``.
    settings : ignored
        NOTE(review): the argument is immediately replaced by the JSON
        loaded from ``'../panel_construction/settings.txt'``.  Kept as-is
        so existing callers see no change -- confirm whether the passed
        value should win instead.
    start, stop : optional
        First / last month, in whatever form ``date_parser`` accepts.
        Default to the first / last sorted key of ``panel_h.stores``.

    Returns
    -------
    None
        Results are written to disk only.

    Notes
    -----
    The stores are closed in a ``finally`` clause, so an exception while
    processing a chunk no longer leaves the HDF5 files open.

    NOTE(review): ``pd.get_store`` is kept because the installed pandas
    version is not visible here; it is absent from recent pandas releases
    -- confirm the pinned version before upgrading.
    """
    # need compensation for real wage
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    analyzed = pd.HDFStore(settings['analyzed_path'])
    out_store = None
    earn_store = None
    try:
        # Read the BLS table once and take both columns from it.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())
        known_keys = set(keys)  # O(1) membership for the month filter

        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])

        # Keep only the months that actually have a store.
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in known_keys]

        # Getting some memory pressure: break into chunks and write each
        # one out, then read the processed chunks back.
        # Chunking by quarter; empty chunks are dropped.
        month_chunks = [x for x in chunk_quarters(months, 3) if len(x) > 0]

        p = pathlib.Path(str(settings['base_path']))
        out_store = HDFHandler(str(p), kind='long', months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(str(p), kind='earn', months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)
            s = out_store.stores[name]

            # add in real hourly wage: compensation aligned on the
            # 'stamp' index level, gaps forward-filled, scaled by 1/100
            c = comp.reindex(df.index,
                             level='stamp').fillna(method='ffill') / 100
            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000
            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100
            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # division by zero hours yields inf; store it as missing
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)
            df = replace_categorical(df, kind='flow', inverse=True)

            with pd.get_store(s.filename) as store:
                df.to_hdf(store, name, format='table', append=False)

            # ------------------------------------------------------------
            # Also write out just earnings (nan issues so can't select
            # later); real hourly earnings had to be computed first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]
            s = earn_store.stores[name]
            with pd.get_store(s.filename) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])
        df = quarterize(df)
        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        # NOTE(review): machine-specific absolute path -- consider moving
        # it into the settings file.
        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'
        with pd.get_store(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Same close order as before; handlers that were never created
        # (failure before construction) are skipped.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()