Exemplos de replace_categorical em Python, exemplos de data_wrangling.cps_wrangling.analysis.helpers.replace_categorical em Python

Exemplo n.º 1

0

Exibir arquivo

    def test_replace_catagorical(self):
        """Check ``helpers.replace_categorical`` in both directions.

        Covers four cases: replacing every column at once, replacing one
        column at a time via ``kind``, the ``inverse=True`` mapping from
        labels back to codes, and the inverse mapping of all nine 'flow'
        labels.
        """
        coded = pd.DataFrame({
            'sex': [1, 2],
            'race': [1, 2],
            'married': [1, 4],
            'labor_status': [1, 2],
            'industry': [1, 3],
            'occupation': [1, 7],
            'edu': [31, 35],
            'flow': [1, 3],
            'history': [0, 1]
        })
        labelled = pd.DataFrame({
            'sex': ['male', 'female'],
            'race': ['White Only', 'Black Only'],
            'married': ["MARRIED, CIVILIAN SPOUSE PRESENT", "WIDOWED"],
            'labor_status': ['employed', 'absent'],
            'industry': ["Agriculture", "Mining"],
            'occupation': ["Management", "Legal"],
            'edu': ["LESS THAN 1ST GRADE", "9TH GRADE"],
            'flow': ['ee', 'en'],
            'history': ['employed', 'not_employed']
        })

        # Full replacement: always hand the helper a copy so ``coded`` stays
        # pristine for the later comparisons.
        everything = helpers.replace_categorical(coded.copy())
        tm.assert_frame_equal(everything, labelled)

        # One column at a time: only the requested column should change.
        for column in coded.columns:
            single = helpers.replace_categorical(coded.copy(), kind=column)
            wanted = coded.copy()
            wanted[column] = labelled[column]
            tm.assert_frame_equal(single, wanted)

        # Inverse: labels map back to the original codes.
        round_trip = helpers.replace_categorical(labelled, inverse=True)
        tm.assert_frame_equal(round_trip, coded)

        # Inverse over every flow label.
        flow_labels = pd.DataFrame(
            {"flow": ['ee', 'eu', 'en', 'ue', 'uu', 'un', 'ne', 'nu', 'nn']})
        flow_codes = pd.DataFrame({"flow": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
        flow_result = helpers.replace_categorical(flow_labels, kind='flow',
                                                  inverse=True)
        tm.assert_frame_equal(flow_result, flow_codes)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: test_helpers.py Projeto: TomAugspurger/dnwr-zlb

    def test_replace_catagorical(self):
        """Exercise ``helpers.replace_categorical`` forwards and backwards.

        The fixture is one table of (column, codes, labels) rows; the coded
        and labelled frames are both derived from it so they cannot drift
        apart.
        """
        spec = [
            ('sex', [1, 2], ['male', 'female']),
            ('race', [1, 2], ['White Only', 'Black Only']),
            ('married', [1, 4],
             ["MARRIED, CIVILIAN SPOUSE PRESENT", "WIDOWED"]),
            ('labor_status', [1, 2], ['employed', 'absent']),
            ('industry', [1, 3], ["Agriculture", "Mining"]),
            ('occupation', [1, 7], ["Management", "Legal"]),
            ('edu', [31, 35], ["LESS THAN 1ST GRADE", "9TH GRADE"]),
            ('flow', [1, 3], ['ee', 'en']),
            ('history', [0, 1], ['employed', 'not_employed']),
        ]
        raw = pd.DataFrame({name: codes for name, codes, _ in spec})
        named = pd.DataFrame({name: labels for name, _, labels in spec})

        # full: every column replaced in a single call (on a copy of raw)
        scratch = raw.copy()
        tm.assert_frame_equal(helpers.replace_categorical(scratch), named)

        # per column: only the column selected by ``kind`` may change
        for name in raw.columns:
            scratch = raw.copy()
            replaced = helpers.replace_categorical(scratch, kind=name)
            target = raw.copy()
            target[name] = named[name]
            tm.assert_frame_equal(replaced, target)

        # inverse: labels back to codes
        restored = helpers.replace_categorical(named, inverse=True)
        tm.assert_frame_equal(restored, raw)

        # inverse for the complete set of flow labels
        all_flows = pd.DataFrame({"flow": ['ee', 'eu', 'en',
                                           'ue', 'uu', 'un',
                                           'ne', 'nu', 'nn']})
        numbered = pd.DataFrame({"flow": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
        outcome = helpers.replace_categorical(all_flows, kind='flow',
                                              inverse=True)
        tm.assert_frame_equal(outcome, numbered)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: make_to_long.py Projeto: vergil7777/dnwr-zlb

def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Convert the monthly panel into quarterly 'long' and 'earn' HDF stores.

    Let's chunk by quarters: the months present in ``panel_h.stores`` are
    grouped with ``chunk_quarters``; each chunk is read with
    ``read_to_long``, given a ``real_hr_earns`` column and written to the
    'long' store, and the rows with usable earnings are also written to the
    'earn' store.  Afterwards the earnings data are passed through
    ``quarterize``, given a ``productivity`` column and written under the
    key 'cleaned' to a hard-coded ``clean.h5`` path.

    Parameters
    ----------
    panel_h : handler exposing a ``stores`` mapping whose keys are month
        names (tested against the ``'m%Y_%m'`` pattern); it is also handed
        to ``read_to_long``.
    settings : NOTE(review): the value passed in is never used -- it is
        immediately replaced by the JSON loaded from
        '../panel_construction/settings.txt'.  The parameter is kept so
        existing callers keep working.
    start, stop : optional month keys given to ``date_parser``; they
        default to the first and last sorted key of ``panel_h.stores``.

    Returns
    -------
    None

    Every store opened here is closed in a ``finally`` clause, so a failure
    while processing a chunk no longer leaves HDF files open.
    """

    # need compensation for real wage
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    analyzed = pd.HDFStore(settings['analyzed_path'])
    # Handles are created further down; start as None so the cleanup below
    # can tell which ones were actually opened.
    out_store = None
    earn_store = None
    try:
        # Read the table once and take both columns from it.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())

        m0 = start or keys[0]
        m0 = date_parser(m0)

        mn = stop or keys[-1]
        mn = date_parser(mn)

        # Keep only the months in [m0, mn] that exist in the panel.
        months = [
            x.strftime('%Y_%m') for x in arrow.Arrow.range('month', m0, mn)
            if x.strftime('m%Y_%m') in keys
        ]

        # Getting some memory pressure. Break into chunks, write each out,
        # then read the processed chunks back.
        # Chunking by quarter

        month_chunks = chunk_quarters(months, 3)
        month_chunks = [x for x in month_chunks if len(x) > 0]
        p = pathlib.Path(str(settings['base_path']))
        out_store = HDFHandler(str(p),
                               kind='long',
                               months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(str(p),
                                kind='earn',
                                months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            # need the three month chunks... maybe zip up with out_store.
            # may need another dict.
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)

            s = out_store.stores[name]

            # add in real hourly wage: compensation aligned to the 'stamp'
            # index level, forward-filled, and scaled down by 100
            c = comp.reindex(df.index,
                             level='stamp').fillna(method='ffill') / 100

            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000

            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100

            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            df['real_hr_earns'] = df['real_hr_earns'].replace(
                np.inf, np.nan)  # div by 0

            df = replace_categorical(df, kind='flow', inverse=True)
            with pd.get_store(s.filename) as store:
                df.to_hdf(store, name, format='table', append=False)

            #------------------------------------------------------------
            # Also write out just earnings (nan issues so can't select
            # later); real hourly earnings must exist first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]

            s = earn_store.stores[name]
            with pd.get_store(s.filename) as store:
                earn.to_hdf(store,
                            name,
                            format='table',
                            append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])

        df = quarterize(df)

        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

        with pd.get_store(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Same close order as before, but now also run on failure.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()

Exemplo n.º 4

0

Exibir arquivo

Arquivo: make_to_long.py Projeto: TomAugspurger/dnwr-zlb

def make_to_long(panel_h, settings, start=None, stop=None):
    """
    Write quarterly 'long' and 'earn' HDF stores built from the monthly panel.

    Let's chunk by quarters.  Months found in ``panel_h.stores`` are grouped
    by ``chunk_quarters``; every chunk is loaded through ``read_to_long``,
    extended with a ``real_hr_earns`` column and stored in the 'long' store,
    while rows with positive hours and earnings and a non-null real wage go
    to the 'earn' store as well.  The combined earnings data are then run
    through ``quarterize``, given a ``productivity`` column and saved as
    'cleaned' in a hard-coded ``clean.h5`` file.

    Parameters
    ----------
    panel_h : handler with a ``stores`` mapping keyed by month name (keys
        are compared against ``'m%Y_%m'``); also passed to ``read_to_long``.
    settings : NOTE(review): ignored -- the argument is overwritten by the
        JSON read from '../panel_construction/settings.txt'.  Retained for
        backward compatibility with existing callers.
    start, stop : optional month keys for ``date_parser``; default to the
        first / last sorted key of ``panel_h.stores``.

    Returns
    -------
    None

    The opened stores are closed in a ``finally`` block so they are
    released even if a chunk fails part-way through.
    """

    # need compensation for real wage
    with open('../panel_construction/settings.txt', 'rt') as f:
        settings = json.load(f)

    analyzed = pd.HDFStore(settings['analyzed_path'])
    # Not opened yet; None lets the cleanup skip handles never created.
    out_store = earn_store = None
    try:
        # One select serves both series.
        bls = analyzed.select('bls_productivity_compensation')
        comp = bls['compensation']
        prod = bls['productivity']

        keys = sorted(panel_h.stores.keys())

        m0 = date_parser(start or keys[0])
        mn = date_parser(stop or keys[-1])

        # Months between m0 and mn that actually exist in the panel.
        months = [x.strftime('%Y_%m')
                  for x in arrow.Arrow.range('month', m0, mn)
                  if x.strftime('m%Y_%m') in keys]

        # Getting some memory pressure. Break into chunks, write each out,
        # then read the processed chunks back.
        # Chunking by quarter

        month_chunks = chunk_quarters(months, 3)
        month_chunks = [x for x in month_chunks if len(x) > 0]
        p = pathlib.Path(str(settings['base_path']))
        out_store = HDFHandler(str(p), kind='long', months=month_chunks,
                               frequency='Q')
        earn_store = HDFHandler(str(p), kind='earn', months=month_chunks,
                                frequency='Q')

        for chunk in month_chunks:
            # need the three month chunks... maybe zip up with out_store.
            # may need another dict.
            df = read_to_long(panel_h, chunk)
            name = make_chunk_name(chunk)

            s = out_store.stores[name]

            # add in real hourly wage: compensation aligned on the 'stamp'
            # index level, forward-filled, divided by 100
            c = comp.reindex(df.index, level='stamp')
            c = c.fillna(method='ffill') / 100

            # adjust weight decimals
            df.loc[:, 'og_weight'] = df['og_weight'] / 10000

            # CPS reports earnings in cents
            df.loc[:, 'earnings'] = df['earnings'] / 100

            df['real_hr_earns'] = (df['earnings'] / df['hours']) / c
            # div by 0
            df['real_hr_earns'] = df['real_hr_earns'].replace(np.inf, np.nan)

            df = replace_categorical(df, kind='flow', inverse=True)
            with pd.get_store(s.filename) as store:
                df.to_hdf(store, name, format='table', append=False)

            #------------------------------------------------------------
            # Also write out just earnings (nan issues so can't select
            # later); real hourly earnings must exist first.
            earn = df[~pd.isnull(df.real_hr_earns)]
            earn = earn[(earn.hours > 0) & (earn.earnings > 0)]

            s = earn_store.stores[name]
            with pd.get_store(s.filename) as store:
                earn.to_hdf(store, name, format='table', append=False,
                            data_columns=True)
            print("Finished " + str(chunk))

        # finally, chunk by quarter and write out.
        df = earn_store.select_all().drop(['occupation', 'actual_hours'],
                                          axis=1)
        df = df.dropna(how='any', subset=['edu', 'age', 'flow', 'expr'])

        df = quarterize(df)

        df['productivity'] = prod.reindex(df.index, level='qmonth')
        df['real_hr_earns'] = df.real_hr_earns.replace(np.inf, np.nan)

        cln_path = '/Volumes/HDD/Users/tom/DataStorage/CPS/analyzed/clean.h5'

        with pd.get_store(cln_path) as store:
            df.to_hdf(store, 'cleaned', format='f', append=False)
    finally:
        # Original close order, now guaranteed on the error path too.
        for handle in (out_store, analyzed, earn_store):
            if handle is not None:
                handle.close()