def run_step(self, prev, params):
    """Clean the yearly industry / investment-type table.

    Args:
        prev: DataFrame from the previous step. The code reads the
            columns ``year``, ``value``, ``count``, ``value_c`` and
            ``investment_type`` plus one industry key column: the first
            column whose name contains ``id`` but neither ``country``
            nor ``ent_id``. ``prev`` itself is not modified.
        params: Step parameters; not used by this step.

    Returns:
        A new DataFrame without ``value_c``, with ``sector_id`` as
        strings and the remaining measure/level columns cast to float.

    Raises:
        IndexError: If no column qualifies as the industry key.
    """
    df = prev
    print('Year YearInvestment: fdi_year_investment_industry')

    # Presumably a leftover index column from an earlier CSV round-trip.
    # Dropped on a new frame so the caller's ``prev`` stays untouched.
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns='Unnamed: 0')

    pk_id = [
        col for col in df.columns
        if 'id' in col and 'country' not in col and 'ent_id' not in col
    ][0]

    # Remove the grand-total row before parsing the key as an integer.
    df = df.loc[df[pk_id] != 'Total general'].copy()

    # Keep only the text before the first space of the key and cast it.
    df[pk_id] = df[pk_id].str.split(' ', n=1, expand=True)[0].astype(int)

    df['value_c'] = df['value_c'].astype(str).str.lower()

    # validate_category is defined elsewhere; it is applied separately to
    # each investment type. pd.concat is used because DataFrame.append was
    # removed in pandas 2.0.
    validated = [
        validate_category(
            df.loc[df['investment_type'] == investment_type],
            pk_id, 'value_c', 'c')
        for investment_type in df['investment_type'].unique()
    ]
    df = pd.concat(validated) if validated else pd.DataFrame()

    # Every industry level other than the key column is filled with 0.
    for level in ('sector_id', 'subsector_id', 'industry_group_id'):
        if level != pk_id:
            df[level] = 0

    # Assign the result back instead of a chained ``inplace=True`` call,
    # which does not update ``df`` under pandas Copy-on-Write.
    df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

    # Rows flagged 'c' are discarded; the flag column is then dropped.
    df = df.loc[df['value_c'] != 'c'].copy()
    df = df.drop(columns=['value_c'])

    df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

    float_cols = ['year', 'value', 'count', 'investment_type',
                  'industry_group_id', 'subsector_id']
    df[float_cols] = df[float_cols].astype(float)

    return df
def run_step(self, prev, params):
    """Clean the quarterly industry / investment-type table.

    Args:
        prev: DataFrame from the previous step. The code reads the
            columns ``year``, ``quarter_id``, ``value``, ``count``,
            ``value_c`` and ``investment_type`` plus the industry key
            column named by ``params['pk']``. ``prev`` is not modified.
        params: Mapping of step parameters; ``params.get('pk')`` must
            give the name of the industry key column.

    Returns:
        A new DataFrame without ``year``, where ``quarter_id`` holds the
        year and quarter digits concatenated (e.g. 2019 and 1 -> 20191),
        ``sector_id`` is a string column and the other measure/level
        columns are cast to float. ``value_c`` is kept.
    """
    df = prev
    print('Investmen STEP: fdi_quarter_industry_investment')
    pk_id = params.get('pk')

    # Remove the grand-total row before parsing the key as an integer.
    df = df.loc[df[pk_id] != 'Total general'].copy()

    # Build the period key by concatenating year and quarter as text.
    df['quarter_id'] = (
        df['year'].astype(int).astype(str)
        + df['quarter_id'].astype(int).astype(str)
    ).astype(int)
    df = df.drop(columns=['year'])

    # Keep only the text before the first space of the key and cast it.
    df[pk_id] = df[pk_id].str.split(' ', n=1, expand=True)[0].astype(int)

    # Every industry level other than the key column is filled with 0.
    for level in ('sector_id', 'subsector_id', 'industry_group_id'):
        if level != pk_id:
            df[level] = 0

    # Assign the result back instead of a chained ``inplace=True`` call,
    # which does not update ``df`` under pandas Copy-on-Write.
    df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

    df['value_c'] = df['value_c'].astype(str).str.lower()

    # validate_category is defined elsewhere; it is applied separately to
    # each investment type. pd.concat is used because DataFrame.append was
    # removed in pandas 2.0.
    validated = [
        validate_category(
            df.loc[df['investment_type'] == investment_type],
            pk_id, 'value_c', 'c')
        for investment_type in df['investment_type'].unique()
    ]
    df = pd.concat(validated) if validated else pd.DataFrame()

    # Rows flagged 'c' are discarded.
    df = df.loc[df['value_c'] != 'c'].copy()

    df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

    float_cols = ['value', 'count', 'investment_type', 'quarter_id',
                  'industry_group_id', 'subsector_id']
    df[float_cols] = df[float_cols].astype(float)

    return df
def run_step(self, prev, params):
    """Clean the quarterly industry table.

    Args:
        prev: DataFrame from the previous step. The code reads the
            columns ``year``, ``quarter_id``, ``count`` and ``value_c``
            plus one industry key column: the first column whose name
            contains ``id`` but neither ``country`` nor ``ent_id``.
            ``prev`` is not modified.
        params: Step parameters; not used by this step.

    Returns:
        A new DataFrame without ``year``, where ``quarter_id`` holds the
        year and quarter digits concatenated (e.g. 2019 and 1 -> 20191),
        ``sector_id`` is a string column, and ``quarter_id``,
        ``value_c``, ``count``, ``industry_group_id`` and
        ``subsector_id`` are cast to float.

    Raises:
        IndexError: If no column qualifies as the industry key.
    """
    df = prev
    print('Year YearQuarter: fdi_quarter_industry')

    pk_id = [
        col for col in df.columns
        if 'id' in col and 'country' not in col and 'ent_id' not in col
    ][0]

    # Remove the grand-total row before parsing the key as an integer.
    df = df.loc[df[pk_id] != 'Total general'].copy()

    # Build the period key by concatenating year and quarter as text.
    df['quarter_id'] = (
        df['year'].astype(int).astype(str)
        + df['quarter_id'].astype(int).astype(str)
    ).astype(int)
    df = df.drop(columns=['year'])

    # Keep only the text before the first space of the key and cast it.
    df[pk_id] = df[pk_id].str.split(' ', n=1, expand=True)[0].astype(int)

    df['value_c'] = df['value_c'].astype(str).str.lower()

    # validate_category is defined elsewhere; here it runs once over the
    # whole frame rather than per group.
    df = validate_category(df, pk_id, 'value_c', 'c')

    # Every industry level other than the key column is filled with 0.
    for level in ('sector_id', 'subsector_id', 'industry_group_id'):
        if level != pk_id:
            df[level] = 0

    # Assign the result back instead of a chained ``inplace=True`` call,
    # which does not update ``df`` under pandas Copy-on-Write.
    df['sector_id'] = df['sector_id'].replace(SECTOR_REPLACE).astype(str)

    # Rows whose value_c is 'c' or 'false' are discarded before the cast.
    keep = (df['value_c'] != 'c') & (df['value_c'] != 'false')
    df = df.loc[keep].copy()

    float_cols = ['quarter_id', 'value_c', 'count',
                  'industry_group_id', 'subsector_id']
    df[float_cols] = df[float_cols].astype(float)

    return df
def run_step(self, prev, params):
    """Load sheet '2.4' and reshape it to one row per investment type.

    Args:
        prev: Excel source accepted by ``pd.read_excel`` (passed through
            unchanged as its first argument).
        params: Step parameters; not used by this step.

    Returns:
        A DataFrame with the columns ``ent_id``, ``year``,
        ``quarter_id``, ``count``, ``value_c`` and ``investment_type``,
        where ``quarter_id`` holds the year and quarter digits
        concatenated (e.g. 2019 and 1 -> 20191) and ``value_c`` is float.
    """
    df = pd.read_excel(prev, sheet_name='2.4')

    # Normalise headers; ``norm`` is defined elsewhere in the project.
    df.columns = [
        norm(col.strip().lower()
             .replace(' ', '_').replace('-', '_').replace('%', 'perc'))
        for col in df.columns
    ]
    print(df.columns)

    # Discard total rows.
    df = df.loc[~df['entidad_federativa'].str.contains('Total')].copy()

    # Translate ent_name to ent_id using the first frame returned by
    # get_dimensions(). The result is assigned back instead of a chained
    # ``inplace=True`` call, which does not update ``df`` under pandas
    # Copy-on-Write.
    dim_geo = get_dimensions()[0]
    df['entidad_federativa'] = df['entidad_federativa'].replace(
        dict(zip(dim_geo['ent_name'], dim_geo['ent_id'])))

    # Positional rename: the sheet is expected to have exactly these 12
    # columns in this order.
    df.columns = [
        'ent_id', 'year', 'quarter_id',
        'value_between_companies', 'value_new_investments',
        'value_re_investments',
        'count_between_companies', 'count_new_investments',
        'count_re_investments',
        'value_between_companies_c', 'value_new_investments_c',
        'value_re_investments_c',
    ]

    # Build the period key by concatenating year and quarter as text.
    df['quarter_id'] = (
        df['year'].astype(int).astype(str)
        + df['quarter_id'].astype(int).astype(str)
    ).astype(int)

    # Only the ``*_c`` value columns and the counts are used below.
    df = df.drop(columns=[
        'value_between_companies', 'value_new_investments',
        'value_re_investments',
    ])

    # Wide -> long: stack one slice per investment type. pd.concat is
    # used because DataFrame.append was removed in pandas 2.0.
    base = ['ent_id', 'year', 'quarter_id']
    slices = []
    for option in ('between_companies', 'new_investments', 're_investments'):
        part = df[base + ['count_{}'.format(option),
                          'value_{}_c'.format(option)]].copy()
        part.columns = ['ent_id', 'year', 'quarter_id', 'count', 'value_c']
        part.dropna(subset=['value_c'], inplace=True)
        part['investment_type'] = option
        slices.append(part)
    df = pd.concat(slices)

    df['investment_type'] = df['investment_type'].replace(INVESTMENT_TYPE)

    # validate_category is defined elsewhere; it is applied separately to
    # the rows of each ent_id.
    validated = [
        validate_category(
            df.loc[df['ent_id'] == ent],
            'investment_type', 'value_c', 'c')
        for ent in df['ent_id'].unique()
    ]
    df = pd.concat(validated) if validated else pd.DataFrame()

    # Rows flagged 'c' (any case) are discarded before the float cast.
    df = df.loc[df['value_c'].astype(str).str.lower() != 'c'].copy()
    df['value_c'] = df['value_c'].astype(float)

    return df