예제 #1
0
    def run_step(self, prev, params):
        """Normalise dimension members and resolve geographic names to ids.

        Args:
            prev: DataFrame handed over by the caller. It is modified in
                place (no copy is taken before columns are reassigned).
                Must contain the columns ``sex``, ``person_type``,
                ``age_range``, ``ent_id``, ``mun_id``, ``level`` and
                ``count``.
            params: Unused by this step.

        Returns:
            A new DataFrame with the columns ``mun_id``, ``level``, ``sex``,
            ``person_type``, ``age_range`` and ``count``; every column
            except ``level`` is cast to ``int``.

        Raises:
            Whatever ``astype(int)`` raises when a value left in a
            non-``level`` column cannot be converted to an integer.
        """
        df = prev

        # Replace members in dimensions. The result is assigned back rather
        # than calling ``df[col].replace(..., inplace=True)``: that form acts
        # on an intermediate Series, is deprecated, and does not update ``df``
        # when pandas Copy-on-Write is enabled.
        df['sex'] = df['sex'].replace(SEX)
        df['person_type'] = df['person_type'].apply(norm).str.lower()
        df['person_type'] = df['person_type'].replace(PERSON_TYPE)
        df['age_range'] = df['age_range'].replace(AGE_RANGE)

        # Geographic names are compared in normalised, upper-case form.
        for col in ['ent_id', 'mun_id']:
            df[col] = df[col].apply(norm).str.upper()

        # Replace missing municipalities.
        df['mun_id'] = df['mun_id'].replace(MISSING_MUN)

        # 'MEXICO' is mapped to 15 explicitly before the general name -> id
        # lookup below.
        df['ent_id'] = df['ent_id'].replace({'MEXICO': 15})

        # Replace names for ids.
        ent, mun = replace_geo()
        df['ent_id'] = df['ent_id'].replace(ent)
        df['mun_id'] = df['mun_id'].replace(mun)

        # Rows whose municipality did not resolve to a known id fall back to
        # "<ent_id>999". The mask is computed once and reused on both sides.
        unresolved = ~df['mun_id'].isin(list(mun.values()))
        df.loc[unresolved, 'mun_id'] = \
            df.loc[unresolved, 'ent_id'].astype(str) + '999'

        df = df[['mun_id', 'level', 'sex', 'person_type', 'age_range', 'count']].copy()

        # Everything except 'level' is stored as an integer.
        for col in df.columns[df.columns != 'level']:
            df[col] = df[col].astype(int)

        return df
예제 #2
0
    def run_step(self, prev, params):
        """Filter confidential rows and normalise dimension / geo columns.

        Args:
            prev: DataFrame handed over by the caller. It is not modified;
                the step works on a filtered copy. Must contain the columns
                ``count``, ``sex``, ``age_range``, ``person_type``,
                ``company_size``, ``ent_id``, ``mun_id``, ``clave`` and
                ``level``.
            params: Unused by this step.

        Returns:
            A new DataFrame with the columns ``ent_id``, ``mun_id`` (taken
            from ``clave``), ``sex``, ``person_type``, ``age_range``,
            ``count`` and ``level``. Every column except ``level`` is cast
            to ``int``, or to ``float`` when the integer cast raises
            ``ValueError``.
        """
        df = prev
        # Filter confidential values: rows whose count is flagged 'c'/'C'.
        df = df.loc[df['count'].astype(str).str.lower() != 'c'].copy()
        for col in ['sex', 'age_range']:
            df[col] = df[col].replace({'c': 0})

        # Replace members in dimensions. Results are assigned back rather
        # than calling ``df[col].replace(..., inplace=True)``: that form acts
        # on an intermediate Series, is deprecated, and does not update ``df``
        # when pandas Copy-on-Write is enabled.
        df['person_type'] = df['person_type'].str.strip().str.lower().apply(norm)
        df['sex'] = df['sex'].replace(SEX)
        df['person_type'] = df['person_type'].replace(PERSON_TYPE)
        df['age_range'] = df['age_range'].replace(AGE_RANGE)
        df = df.drop(columns=['company_size'])

        # Missing geography gets hard-coded placeholder codes.
        df.loc[df['ent_id'].isna(), 'ent_id'] = '33'
        df.loc[df['mun_id'].isna(), 'mun_id'] = '33000'

        # NOTE(review): the 'mun_id' clean-up here and below is discarded
        # when the column is dropped in favour of 'clave' further down; it is
        # kept so that any error it raises today is still raised. Confirm
        # whether it can be removed.
        for col in ['ent_id', 'mun_id']:
            df[col] = df[col].apply(norm).str.upper()

        # Replace missing municipalities.
        df['mun_id'] = df['mun_id'].replace(MISSING_MUN)

        # 'MEXICO' is mapped to 15 explicitly before the general name -> id
        # lookup below.
        df['ent_id'] = df['ent_id'].replace({'MEXICO': 15})

        # Replace entity names for ids (the municipality mapping is not
        # needed in this step).
        ent, _ = replace_geo()
        df['ent_id'] = df['ent_id'].replace(ent)

        # Variable "clave" denotes "mun_id".
        df = df.drop("mun_id", axis=1)
        df = df.rename(columns={"clave": "mun_id"})

        # Rows whose municipality code equals one of the entity ids are
        # rewritten as "<ent_id>999". The mask is computed once and reused.
        is_entity_code = df['mun_id'].isin(df.ent_id.unique())
        df.loc[is_entity_code, 'mun_id'] = \
            df.loc[is_entity_code, 'ent_id'].astype(str) + '999'

        df = df[[
            'ent_id', 'mun_id', 'sex', 'person_type', 'age_range', 'count',
            'level'
        ]].copy()

        # Prefer integers; fall back to float for columns that cannot be
        # cast to int.
        for col in df.columns[df.columns != 'level']:
            try:
                df[col] = df[col].astype(int)
            except ValueError:
                print('Column {} to float type'.format(col))
                df[col] = df[col].astype(float)

        return df