예제 #1
0
    def run_step(self, prev, params):
        """Clean the yearly FDI table broken down by industry level and investment type.

        Args:
            prev: DataFrame from the previous step. The code reads the columns
                ``value_c``, ``investment_type``, ``year``, ``value`` and
                ``count`` plus one id column used as primary key.
            params: Step parameters; not used by this step.

        Returns:
            A new DataFrame without the 'Total general' rows and without the
            rows whose ``value_c`` is ``'c'``; ``value_c`` is dropped and the
            numeric columns are cast to float. ``prev`` is left untouched.

        Raises:
            IndexError: If no column name qualifies as the primary-key id.
        """
        df = prev
        print('Year YearInvestment: fdi_year_investment_industry')

        # Drop the stray index column on a new frame so the caller's
        # DataFrame is not mutated.
        if 'Unnamed: 0' in df.columns:
            df = df.drop(columns='Unnamed: 0')

        # The primary key is the first column whose name contains 'id' but
        # neither 'country' nor 'ent_id'.
        pk_id = [
            col for col in df.columns
            if 'id' in col and 'country' not in col and 'ent_id' not in col
        ][0]

        df = df.loc[df[pk_id] != 'Total general'].copy()

        # Keep only the text before the first space and cast it to int.
        df[pk_id] = df[pk_id].str.split(' ', n=1).str[0].astype(int)

        df['value_c'] = df['value_c'].astype(str).str.lower()

        # Run validate_category once per investment type and stitch the
        # results together with a single concat (DataFrame.append was removed
        # in pandas 2.0).
        # NOTE(review): validate_category is defined elsewhere; it is assumed
        # to return a DataFrame - confirm.
        frames = [
            validate_category(df.loc[df['investment_type'] == investment_type],
                              pk_id, 'value_c', 'c')
            for investment_type in df['investment_type'].unique()
        ]
        df = pd.concat(frames) if frames else df.iloc[0:0].copy()

        # Every industry level other than the primary key is filled with 0.
        for level in ('sector_id', 'subsector_id', 'industry_group_id'):
            if level != pk_id:
                df[level] = 0

        # Assign the result back instead of a chained in-place replace, which
        # does not update the frame under pandas Copy-on-Write.
        df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

        df = df.loc[df['value_c'] != 'c'].copy()
        df.drop(columns=['value_c'], inplace=True)

        df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

        float_columns = ['year', 'value', 'count', 'investment_type',
                         'industry_group_id', 'subsector_id']
        df[float_columns] = df[float_columns].astype(float)

        return df
예제 #2
0
    def run_step(self, prev, params):
        """Clean the quarterly FDI table broken down by industry level and investment type.

        Args:
            prev: DataFrame from the previous step. The code reads the columns
                ``year``, ``quarter_id``, ``value_c``, ``investment_type``,
                ``value`` and ``count`` plus the primary-key column.
            params: Mapping whose ``'pk'`` entry names the primary-key column.

        Returns:
            A new DataFrame without the 'Total general' rows and without the
            rows whose ``value_c`` is ``'c'``; ``year`` is folded into
            ``quarter_id`` and the numeric columns are cast to float.
        """
        df = prev
        print('Investmen STEP: fdi_quarter_industry_investment')
        pk_id = params.get('pk')

        df = df.loc[df[pk_id] != 'Total general'].copy()

        # Concatenate the year and quarter digits into one integer id,
        # e.g. year 2019 and quarter 1 become 20191.
        year_text = df['year'].astype(int).astype(str)
        quarter_text = df['quarter_id'].astype(int).astype(str)
        df['quarter_id'] = (year_text + quarter_text).astype(int)
        df.drop(columns=['year'], inplace=True)

        # Keep only the text before the first space and cast it to int.
        df[pk_id] = df[pk_id].str.split(' ', n=1).str[0].astype(int)

        # Every industry level other than the primary key is filled with 0.
        for level in ('sector_id', 'subsector_id', 'industry_group_id'):
            if level != pk_id:
                df[level] = 0

        # Assign the result back instead of a chained in-place replace, which
        # does not update the frame under pandas Copy-on-Write.
        df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

        df['value_c'] = df['value_c'].astype(str).str.lower()

        # Run validate_category once per investment type and stitch the
        # results together with a single concat (DataFrame.append was removed
        # in pandas 2.0).
        # NOTE(review): validate_category is defined elsewhere; it is assumed
        # to return a DataFrame - confirm.
        frames = [
            validate_category(df.loc[df['investment_type'] == investment_type],
                              pk_id, 'value_c', 'c')
            for investment_type in df['investment_type'].unique()
        ]
        df = pd.concat(frames) if frames else df.iloc[0:0].copy()

        df = df.loc[df['value_c'] != 'c'].copy()

        df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

        float_columns = ['value', 'count', 'investment_type', 'quarter_id',
                         'industry_group_id', 'subsector_id']
        df[float_columns] = df[float_columns].astype(float)

        return df
예제 #3
0
    def run_step(self, prev, params):
        """Clean the quarterly FDI table broken down by industry level.

        Args:
            prev: DataFrame from the previous step. The code reads the columns
                ``year``, ``quarter_id``, ``value_c`` and ``count`` plus one
                id column used as primary key.
            params: Step parameters; not used by this step.

        Returns:
            A new DataFrame without the 'Total general' rows and without the
            rows whose ``value_c`` is ``'c'`` or ``'false'``; ``year`` is
            folded into ``quarter_id`` and the numeric columns (including
            ``value_c``) are cast to float.

        Raises:
            IndexError: If no column name qualifies as the primary-key id.
        """
        df = prev
        print('Year YearQuarter: fdi_quarter_industry')

        # The primary key is the first column whose name contains 'id' but
        # neither 'country' nor 'ent_id'.
        pk_id = [
            col for col in df.columns
            if 'id' in col and 'country' not in col and 'ent_id' not in col
        ][0]

        df = df.loc[df[pk_id] != 'Total general'].copy()

        # Concatenate the year and quarter digits into one integer id,
        # e.g. year 2019 and quarter 1 become 20191.
        year_text = df['year'].astype(int).astype(str)
        quarter_text = df['quarter_id'].astype(int).astype(str)
        df['quarter_id'] = (year_text + quarter_text).astype(int)
        df.drop(columns=['year'], inplace=True)

        # Keep only the text before the first space and cast it to int.
        df[pk_id] = df[pk_id].str.split(' ', n=1).str[0].astype(int)

        df['value_c'] = df['value_c'].astype(str).str.lower()

        # The whole frame is validated in one call (no per-group split here).
        # NOTE(review): validate_category is defined elsewhere; it is assumed
        # to return a DataFrame - confirm.
        df = validate_category(df, pk_id, 'value_c', 'c')

        # Every industry level other than the primary key is filled with 0.
        for level in ('sector_id', 'subsector_id', 'industry_group_id'):
            if level != pk_id:
                df[level] = 0

        # Assign the result back instead of a chained in-place replace, which
        # does not update the frame under pandas Copy-on-Write.
        df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

        # Discard the 'c' and 'false' markers so value_c can be cast to float.
        df = df.loc[df['value_c'] != 'c'].copy()
        df = df.loc[df['value_c'] != 'false'].copy()

        float_columns = ['quarter_id', 'value_c', 'count',
                         'industry_group_id', 'subsector_id']
        df[float_columns] = df[float_columns].astype(float)

        return df
예제 #4
0
파일: fdi_2.py 프로젝트: anabcm/data-etl
    def run_step(self, prev, params):
        """Load sheet '2.4' and reshape it to one row per state, quarter and investment type.

        Args:
            prev: Excel source passed straight to ``pd.read_excel``.
            params: Step parameters; not used by this step.

        Returns:
            A DataFrame with the columns ``ent_id``, ``year``, ``quarter_id``,
            ``count``, ``value_c`` and ``investment_type``. Rows whose
            ``value_c`` is missing or equals ``'c'`` (case-insensitive) are
            removed and ``value_c`` is cast to float.
        """
        data = prev
        df = pd.read_excel(data, sheet_name='2.4')
        df.columns = [
            norm(x.strip().lower().replace(' ', '_').replace('-', '_').replace(
                '%', 'perc')) for x in df.columns
        ]
        print(df.columns)
        df = df.loc[~df['entidad_federativa'].str.contains('Total')].copy()

        # Map state names to ids using the first dimension table.
        dim_geo = get_dimensions()[0]
        ent_ids_by_name = dict(zip(dim_geo['ent_name'], dim_geo['ent_id']))
        # Assign the result back instead of a chained in-place replace, which
        # does not update the frame under pandas Copy-on-Write.
        df['entidad_federativa'] = df['entidad_federativa'].replace(
            ent_ids_by_name)

        # Positional rename: assumes the sheet's columns arrive in exactly
        # this order - confirm against the workbook.
        df.columns = [
            'ent_id', 'year', 'quarter_id', 'value_between_companies',
            'value_new_investments', 'value_re_investments',
            'count_between_companies', 'count_new_investments',
            'count_re_investments', 'value_between_companies_c',
            'value_new_investments_c', 'value_re_investments_c'
        ]

        # Concatenate the year and quarter digits into one integer id,
        # e.g. year 2019 and quarter 1 become 20191.
        year_text = df['year'].astype(int).astype(str)
        quarter_text = df['quarter_id'].astype(int).astype(str)
        df['quarter_id'] = (year_text + quarter_text).astype(int)

        df.drop(columns=[
            'value_between_companies', 'value_new_investments',
            'value_re_investments'
        ],
                inplace=True)

        # Wide -> long: one frame per investment type, stacked with a single
        # concat (DataFrame.append was removed in pandas 2.0).
        base = ['ent_id', 'year', 'quarter_id']
        per_type = []
        for option in ('between_companies', 'new_investments',
                       're_investments'):
            part = df[base + ['count_{}'.format(option),
                              'value_{}_c'.format(option)]].copy()
            part.columns = base + ['count', 'value_c']
            part.dropna(subset=['value_c'], inplace=True)
            part['investment_type'] = option
            per_type.append(part)
        df = pd.concat(per_type)

        df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

        # Run validate_category once per state and stitch the results
        # together.
        # NOTE(review): validate_category is defined elsewhere; it is assumed
        # to return a DataFrame - confirm.
        per_state = [
            validate_category(df.loc[df['ent_id'] == ent],
                              'investment_type', 'value_c', 'c')
            for ent in df['ent_id'].unique()
        ]
        df = pd.concat(per_state) if per_state else df.iloc[0:0].copy()

        df = df.loc[df['value_c'].astype(str).str.lower() != 'c'].copy()
        df['value_c'] = df['value_c'].astype(float)

        return df