def run_step(self, prev, params):
    """Build the municipality-by-sector frame (level 4) from the base table.

    Keeps the rows of ``base`` whose stripped ``CODIGO`` is two characters
    long or is one of the range codes ``31-33`` / ``48-49``, that have no
    ``ID_ESTRATO`` and that do have a ``MUNICIPIO``.

    Args:
        prev: 4-tuple ``(df_ent_sec, df_ent_sub, df_ent_ram, base)``. The
            first three items are returned untouched; ``base`` is only read.
        params: Unused.

    Returns:
        tuple: ``(df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec)``.
    """
    logger.info('Municipality - Sector Step...')
    df_ent_sec, df_ent_sub, df_ent_ram, base = prev

    # Strip once so every comparison sees the same normalised code. The raw
    # value used to be compared against '31-33', so a padded '31-33 ' was
    # dropped while a padded '48-49 ' was kept.
    code = base['CODIGO'].str.strip()
    is_sector = (code.str.len() == 2) | code.isin(['31-33', '48-49'])
    keep = (
        is_sector
        & base['CODIGO'].notna()
        & base['ID_ESTRATO'].isna()
        & base['MUNICIPIO'].notna()
    )
    df = base.loc[keep].copy()

    df['sector_id'] = df['CODIGO'].astype(str).str.strip()
    # mun_id is ENTIDAD followed by MUNICIPIO left-padded to three characters.
    # NOTE(review): assumes MUNICIPIO stringifies without a decimal part
    # (e.g. '5', not '5.0') - confirm against the reader step.
    df['mun_id'] = (
        df['ENTIDAD'].astype(str) + df['MUNICIPIO'].astype(str).str.zfill(3)
    ).astype(int)
    df.drop(columns=['ID_ESTRATO', 'CODIGO', 'MUNICIPIO', 'ENTIDAD'],
            inplace=True)
    df = fill_level(df, FILL_COLUMNS)

    # Municipality-Sector
    df['level'] = 4
    return df_ent_sec, df_ent_sub, df_ent_ram, df
def run_step(self, prev, params):
    """Reshape the eight chart frames into one tidy CSV.

    Charts 1-4 take their year from the last four characters of the
    ``censo`` column and are tagged ``"INEI"``; charts 5-8 get the fixed
    year 2017 and are tagged ``"ENE"``. The response is read by position
    (third column for charts 1-4, second column otherwise).

    Args:
        prev: Mapping ``{1..8: DataFrame}``. The frames are copied before
            being modified, so the caller's objects are left untouched
            (the previous version rewrote them in place).
        params: Unused.

    Returns:
        int: Always ``0``; the result is written to
        ``data_temp/tidy_file.csv``.
    """
    logger.info("Tidying up DataFrame...")

    census_charts = {1, 2, 3, 4}
    output_columns = ["region", "data_origin", "year", "variable",
                      "response", "percentage"]

    tidy_frames = []
    for i in range(1, 9):
        from_census = i in census_charts
        chart = prev[i].copy()

        chart["region"] = chart["region"].str.title()
        chart["data_origin"] = "INEI" if from_census else "ENE"
        if from_census:
            chart = chart.rename(columns={"censo": "year"})
            chart["year"] = chart["year"].str[-4:].astype(int)
        else:
            chart["year"] = 2017

        chart["variable"] = VARIABLE_DICT[i]
        # Positional lookup: the helper columns above are appended at the
        # end, so positions 1 and 2 still refer to source columns.
        chart["response"] = chart.iloc[:, 2 if from_census else 1]
        chart = chart.rename(columns={"valor_porcentaje": "percentage"})
        tidy_frames.append(chart[output_columns])

    tidy = pd.concat(tidy_frames, ignore_index=True)
    tidy.to_csv("data_temp/tidy_file.csv", index=False,
                quoting=csv.QUOTE_NONNUMERIC)
    return 0
def run_step(self, prev, params):
    """Derive the state-by-subsector frame (level 2) from the base table.

    Selects rows whose stripped ``CODIGO`` has three characters and whose
    ``ID_ESTRATO`` and ``MUNICIPIO`` are both missing.

    Args:
        prev: 2-tuple ``(df_ent_sec, base)``.
        params: Unused.

    Returns:
        tuple: ``(df_ent_sec, df_ent_sub, base)``.
    """
    logger.info('State - Subsector Step...')
    df_ent_sec, base = prev

    three_char_code = base['CODIGO'].str.strip().str.len() == 3
    state_level = base['ID_ESTRATO'].isna() & base['MUNICIPIO'].isna()
    has_code = base['CODIGO'].notna()
    selected = base.loc[three_char_code & state_level & has_code].copy()

    selected['subsector_id'] = selected['CODIGO'].astype(int)
    selected['ent_id'] = selected['ENTIDAD'].astype(int)
    selected = selected.drop(
        columns=['ID_ESTRATO', 'CODIGO', 'MUNICIPIO', 'ENTIDAD'])
    selected = fill_level(selected, FILL_COLUMNS)

    # State-Subsector
    selected['level'] = 2
    df_ent_sub = selected.copy()
    return df_ent_sec, df_ent_sub, base
def run_step(self, prev, params):
    """Turn the raw GDP CSV into ``year`` / ``country_id`` / ``gdp_value``.

    Args:
        prev: Path (or buffer) accepted by ``pandas.read_csv`` that holds
            the columns ``Year``, ``Country Code`` and ``Value``.
        params: Unused.

    Returns:
        pandas.DataFrame: Rows from position 2306 onwards, with the country
        code replaced by the id found in ``data_temp/dim_country.csv``.
        Codes missing from that file map to NaN.
    """
    logger.info("Applying Transformations...")

    # The first 2306 rows are skipped.
    # NOTE(review): presumably those are aggregate/regional entries that
    # precede the per-country rows - confirm against the source file.
    # .copy() makes the slice an independent frame, so the column added
    # below is not a chained assignment on a view of the full table.
    gdp = pd.read_csv(prev).iloc[2306:].copy()

    # Country Dim
    dim = pd.read_csv("data_temp/dim_country.csv")
    code_to_id = dict(zip(dim["country_code"], dim["country_id"]))
    gdp["country_id"] = gdp["Country Code"].map(code_to_id)

    # Tidying Up
    gdp = gdp.rename(columns={"Year": "year", "Value": "gdp_value"})
    return gdp[["year", "country_id", "gdp_value"]]
def run_step(self, prev, params):
    """Read every data CSV, apply the dictionary's column names and stack them.

    Files are classified by the ``'file'`` entry of their metadata: a name
    containing ``'diccionario'`` is the column dictionary, names containing
    ``'ce2019_nac'`` or ``'Indicators'`` are skipped, everything else is
    treated as data.

    Args:
        prev: Iterable of items where ``item[0]`` is a path and
            ``item[1]['file']`` is the file name.
        params: Unused.

    Returns:
        pandas.DataFrame: The stacked data files (original row labels kept)
        with single-space cells replaced by NaN and a constant ``year``
        column set to 2019. An empty frame with only ``year`` is returned
        when there are no data files.
    """
    logger.info('Read Step...')

    label = ''
    data = []
    for ele in prev:
        file_name = ele[1]['file']
        if 'diccionario' in file_name:
            label = ele[0]
        elif 'ce2019_nac' in file_name or 'Indicators' in file_name:
            continue
        else:
            data.append(ele[0])

    frames = []
    columns = None
    for path in data:
        if columns is None:
            # column names: the dictionary is the same for every data file,
            # so it is read once instead of once per file.
            columns = list(pd.read_csv(label).reset_index()['index'])

        df = pd.read_csv(path, engine='python', sep="[\t]*,[\t]*", header=0)
        df = df.reset_index()
        df.drop(columns=['ENTIDAD'], inplace=True)
        df.columns = columns
        df.replace(' ', np.nan, inplace=True)
        frames.append(df)

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows on every iteration.
    df = pd.concat(frames) if frames else pd.DataFrame()
    df['year'] = 2019
    return df
def run_step(self, prev, params):
    """Assemble the fact table by swapping labels for dimension ids.

    Reads ``data_temp/tidy_file.csv`` and looks up region and
    variable/response ids in the dimension CSVs under ``data_output/``.
    Labels without a match in a dimension come out as NaN.

    Args:
        prev: Unused.
        params: Unused.

    Returns:
        pandas.DataFrame: Columns ``region_id``, ``data_origin_id``,
        ``response_id``, ``year`` and ``percentage``.
    """
    logger.info("Creating Fact Table...")
    fact = pd.read_csv("data_temp/tidy_file.csv")

    regions = pd.read_csv("data_output/tic_dim_region.csv")
    region_ids = dict(zip(regions["region_name"], regions["region_id"]))
    fact["region_id"] = fact["region"].map(region_ids)

    fact["data_origin_id"] = fact["data_origin"].map({"INEI": 1, "ENE": 0})

    variables = pd.read_csv("data_output/tic_dim_variable.csv")
    response_ids = dict(zip(variables["combined"], variables["response_id"]))
    # Same "variable|response" key the variable dimension is built from.
    fact["combined"] = fact["variable"] + "|" + fact["response"]
    fact["response_id"] = fact["combined"].map(response_ids)

    selected = ["region_id", "data_origin_id", "response_id", "year",
                "percentage"]
    return fact[selected]
def run_step(self, prev, params):
    """Load ``data_source/chart1.xlsx`` .. ``chart8.xlsx`` into a dict.

    Args:
        prev: Unused.
        params: Unused.

    Returns:
        dict: ``{1: DataFrame, ..., 8: DataFrame}`` keyed by chart number.
    """
    logger.info("Opening files from source folder...")
    charts = {}
    for number in range(1, 9):
        charts[number] = pd.read_excel(
            "data_source/chart{}.xlsx".format(number))
    return charts
def run_step(self, prev, params):
    """Build the region dimension from the tidy file.

    Args:
        prev: Unused.
        params: Unused.

    Returns:
        pandas.DataFrame: One row per distinct ``region`` value, sorted by
        name, with ``region_id`` counting up from 0.
    """
    logger.info("Creating Region Dimension...")
    tidy = pd.read_csv("data_temp/tidy_file.csv")
    names = sorted(list(tidy["region"].unique()))
    ids = list(range(len(names)))
    return pd.DataFrame({"region_id": ids, "region_name": names})
def run_step(self, prev, params):
    """Normalise one parsed claims file and append it to the partial CSV.

    Args:
        prev: DataFrame with the source columns ``stateName``,
            ``weekEnded``, ``InitialClaims``, ``ReflectingWeekEnded``,
            ``ContinuedClaims``, ``CoveredEmployment`` and
            ``InsuredUnemploymentRate``. It is not modified.
        params: Mapping; ``params["filename"]`` is logged and decides
            whether the CSV header row is written.

    Returns:
        pandas.DataFrame: The typed frame, sorted by
        ``reflecting_week_end`` descending. The same rows are appended to
        ``./unemployment_output/partial_output.csv``.
    """
    filename = params.get("filename")
    logger.info("Transforming {}".format(filename))

    df = prev.rename(columns={
        "stateName": "state_name",
        "weekEnded": "week_ended",
        "InitialClaims": "initial_claims",
        "ReflectingWeekEnded": "reflecting_week_end",
        "ContinuedClaims": "continued_claims",
        "CoveredEmployment": "covered_employment",
        "InsuredUnemploymentRate": "insured_unemployment_rate",
    })

    # US Mapping: state name -> FIPS code, prefixed with "04000US".
    fips_map = us.states.mapping("name", "fips")
    df["fips_code"] = "04000US" + df["state_name"].map(fips_map)

    # Columns order and type. .copy() makes the selection an independent
    # frame, so the casts below are not chained assignments on a slice.
    df = df[[
        "week_ended", "reflecting_week_end", "fips_code", "state_name",
        "initial_claims", "continued_claims", "covered_employment",
        "insured_unemployment_rate",
    ]].copy()
    for c in ["week_ended", "reflecting_week_end"]:
        df[c] = pd.to_datetime(df[c], format="%m/%d/%Y")
    for c in ["initial_claims", "continued_claims", "covered_employment"]:
        # Counts arrive as text with thousands separators.
        df[c] = df[c].str.replace(",", "").astype("int64")
    df["insured_unemployment_rate"] = df[
        "insured_unemployment_rate"].astype("float64")

    # Sort by latest date
    df = df.sort_values(by=["reflecting_week_end"], ascending=False)

    # Append to CSV. Only "A-M.xml" writes the header row.
    # NOTE(review): presumably that file is always processed first and the
    # output file does not exist beforehand - confirm with the pipeline
    # runner, otherwise the header is missing or duplicated.
    write_header = filename == "A-M.xml"
    df.to_csv("./unemployment_output/partial_output.csv",
              header=write_header, index=False, mode="a",
              quoting=csv.QUOTE_NONNUMERIC)
    return df
def run_step(self, prev_result, params):
    """Read every sheet of the workbook and melt it to long format.

    For each sheet, only rows whose ``Periodo`` starts with ``'1'`` or
    ``'2'`` are kept; the remaining columns are melted into ``name`` /
    ``value`` pairs and sorted by ``Periodo``.

    Args:
        prev_result: Path or buffer accepted by ``pandas.read_excel``.
        params: Unused.

    Returns:
        pandas.DataFrame: All melted sheets stacked in sheet order (row
        labels are not reset). Empty when the workbook has no sheets.
    """
    logger.info('Running Read step...')
    sheets = pd.read_excel(prev_result, sheet_name=None, skiprows=2)

    melted = []
    for sheet in sheets.values():
        first_char = sheet['Periodo'].str[0]
        period_rows = sheet[first_char.isin(['1', '2'])]
        melted.append(
            period_rows.melt(id_vars='Periodo', var_name='name')
            .sort_values(by='Periodo'))

    if not melted:
        return pd.DataFrame()
    # One concat at the end instead of growing a frame inside the loop,
    # which re-copied every previously read sheet on each iteration.
    return pd.concat(melted)
def run_step(self, prev_result, params):
    """Build the expense-type dimension from the shared ``global_df``.

    Distinct ``desc_tipogasto`` values are de-duplicated on their
    lower-cased, accent-stripped form, numbered from 1, translated to
    English and spell-corrected.

    Args:
        prev_result: Unused; the input comes from
            ``self.get_pipeline_results_ref()["global_df"]``.
        params: Unused.

    Returns:
        pandas.DataFrame: Columns ``exp_type_id`` (str), ``exp_type_des``,
        ``exp_type_des_en`` and the join key ``desc_tipogasto_mod``.
    """
    logger.info('Running expenses type dimension step...')
    results = self.get_pipeline_results_ref()
    df = results["global_df"][['desc_tipogasto']]

    # groupby yields the sorted, non-null distinct descriptions.
    df = df.groupby(['desc_tipogasto'
                     ]).size().to_frame(name="count").reset_index()
    df['desc_tipogasto_mod'] = df['desc_tipogasto'].apply(
        lambda x: unidecode.unidecode(x.lower()))
    df.drop_duplicates(subset='desc_tipogasto_mod', keep='first',
                       inplace=True)
    df.drop(columns=['count'], inplace=True)
    df.insert(0, 'id_tipogasto', range(1, 1 + len(df)))

    # Built once: the previous version constructed a new translator object
    # for every row.
    translator = GoogleTranslator(source='auto', target='en')
    # Translate the raw description first, then overwrite it with the
    # spell-corrected text.
    df['exp_type_des_en'] = df['desc_tipogasto'].apply(
        lambda text: translator.translate(text).capitalize())
    df['desc_tipogasto'] = df['desc_tipogasto'].apply(
        lambda text: speller.correction(text).capitalize())

    df.rename(columns={
        'desc_tipogasto': 'exp_type_des',
        'id_tipogasto': 'exp_type_id'
    }, inplace=True)
    df['exp_type_id'] = df['exp_type_id'].astype(str)
    return df[[
        'exp_type_id', 'exp_type_des', 'exp_type_des_en',
        'desc_tipogasto_mod'
    ]]
def run_step(self, prev_result, params):
    """Convert the melted trade sheet into month / country / flow rows.

    Args:
        prev_result: DataFrame with the columns ``Periodo``, ``name`` and
            ``value``. It is left unmodified (helper columns used to be
            added to it in place).
        params: Unused.

    Returns:
        pandas.DataFrame: Columns ``month_id`` (int), ``iso3``,
        ``trade_flow_id``, ``petroleum`` and ``value`` (float, missing
        values as 0, multiplied by 1000). Rows whose value is ``'C'`` and
        rows whose country name contains ``total``/``Total`` are removed.
        ``iso3`` is NaN for names not found in the country lookup.
    """
    logger.info('Running Xform step...')

    names = prev_result['name']
    df = prev_result.assign(
        month_id=prev_result['Periodo'].str.replace('/', ''),
        trade_flow_id=names.apply(
            lambda x: 1 if 'Importaciones' in x.split() else 2),
        petroleum=names.apply(
            lambda x: 1 if 'Petroleras' in x.split() else 2),
        # The country is the last segment of the ' > ' separated label.
        country_name=names.apply(lambda x: x.split(' > ')[-1]),
    )

    # Country lookup: shared dimension plus the module-level overrides.
    dim_country = query_to_df(self.connector,
                              'select * from dim_shared_country')
    dim_country = dim_country[['country_name_es', 'iso3']]
    overrides = pd.DataFrame(list(iso3_dict.items()),
                             columns=['country_name_es', 'iso3'])
    dim_country = pd.concat([dim_country, overrides])
    lookup = dim_country[['iso3', 'country_name_es']].drop_duplicates(
        subset=['country_name_es'])

    for string in strings_to_remove:
        # Only the first occurrence is removed. Literal-vs-regex matching
        # follows the installed pandas default, as before.
        df['country_name'] = df['country_name'].str.replace(string, '', n=1)

    is_total = (df['country_name'].str.contains('total')
                | df['country_name'].str.contains('Total'))
    df = df[~is_total]
    df = df.merge(lookup, left_on='country_name',
                  right_on='country_name_es', how='left')

    df = df[['month_id', 'iso3', 'trade_flow_id', 'petroleum', 'value']]
    # .copy() so the casts below write to an independent frame rather than
    # to a filtered slice.
    df = df[df['value'] != 'C'].copy()
    df['value'] = df['value'].astype(float).fillna(0) * 1000
    df['month_id'] = df['month_id'].astype(int)
    return df
def run_step(self, prev, params):
    """Stack the four level frames into one table with normalised types.

    Args:
        prev: 4-tuple ``(df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec)``.
        params: Unused.

    Returns:
        pandas.DataFrame: The frames stacked in that order (row labels are
        kept, missing columns filled with NaN). ``sector_id`` is cast to
        str, ``UE`` and ``level`` to int, every other column to float; a
        constant ``year`` column (2019) is added and all column names are
        lower-cased.
    """
    logger.info('Join Step...')
    df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec = prev

    # DataFrame.append was removed in pandas 2.0; concat yields the same
    # stacked frame in a single pass.
    df = pd.concat([df_ent_sec, df_ent_sub, df_ent_ram, df_mun_sec],
                   sort=False)

    # Two passes on purpose: everything except sector_id goes through float
    # first, then UE and level are narrowed to int.
    numeric_cols = [col for col in df.columns if col != 'sector_id']
    df = df.astype({col: float for col in numeric_cols})
    df = df.astype({'sector_id': str, 'UE': int, 'level': int})

    df['year'] = 2019
    df.columns = df.columns.str.lower()
    return df
def run_step(self, prev, params):
    """Melt the wide monthly trade table into one row per country and month.

    Args:
        prev: DataFrame with an ``ORIGIN_OR_DESTINATION`` column and one
            column per upper-cased month name (``JANUARY`` .. ``DECEMBER``).
        params: Mapping with ``trade_flow`` (``"IMP"`` or ``"EXP"``) and
            ``year`` (str or int).

    Returns:
        pandas.DataFrame: Columns ``time_id`` (int ``YYYYMM``), ``country``
        (lower-cased ISO3 code, NaN when the name is not in
        ``resources/country_iso3_codes.csv``), ``trade_flow`` (1 for IMP,
        2 for EXP) and ``total``.

    Raises:
        KeyError: If ``params["trade_flow"]`` is not ``"IMP"`` or ``"EXP"``.
    """
    filename = "Wakanda_{}_{}.csv".format(params.get("trade_flow"),
                                          params.get("year"))
    logger.info("Transforming {}...".format(filename))

    # 1. Melt months (month names come from ``calendar`` and therefore
    #    follow the process locale).
    month_names = [m.upper() for m in calendar.month_name[1:]]
    df = prev.melt(id_vars=["ORIGIN_OR_DESTINATION"],
                   value_vars=month_names,
                   var_name="month",
                   value_name="total")

    # 2. Rename and drop NaN values; .copy() gives an independent frame for
    #    the column assignments below.
    df = df.rename(columns={"ORIGIN_OR_DESTINATION": "country"})
    df = df.dropna().copy()

    # 3. Map country names to ISO3
    country_df = pd.read_csv("resources/country_iso3_codes.csv")
    country_map = dict(zip(country_df["country_name"],
                           country_df["country_iso3"]))
    df["country"] = df["country"].map(country_map).str.lower()

    # 4. Map month names to numeric
    month_map = {name: number
                 for number, name in enumerate(month_names, start=1)}
    month_number = df["month"].map(month_map)

    # 5. Create trade flow column
    flow_map = {"IMP": 1, "EXP": 2}
    df["trade_flow"] = flow_map[params["trade_flow"]]

    # 6. Create time_id. The year is coerced to text first: with an int
    #    year the old ``year + "MM"`` concatenation raised TypeError.
    year = str(params["year"])
    df["time_id"] = (year + month_number.astype(str).str.zfill(2)).astype(int)

    df = df[["time_id", "country", "trade_flow", "total"]]

    # 7. Print the DataFrame
    print(df.head())
    print(df.isnull().any())
    return df
def run_step(self, prev, params):
    """Build the variable/response dimension from the tidy file.

    Args:
        prev: Unused.
        params: Unused.

    Returns:
        pandas.DataFrame: One row per distinct ``"variable|response"``
        pair, in order of first appearance, with ``response_id`` counting
        up from 0 and the columns ``variable_name``, ``response_name`` and
        ``combined``.
    """
    logger.info("Creating Variable Dimension...")
    tidy = pd.read_csv("data_temp/tidy_file.csv")

    pairs = tidy["variable"] + "|" + tidy["response"]
    dim = pairs.to_frame(name="combined")
    dim = dim.drop_duplicates().reset_index(drop=True)

    # Split the key back into its two halves.
    parts = dim["combined"].str.split("|")
    dim["response_id"] = dim.index
    dim["variable_name"] = parts.str[0]
    dim["response_name"] = parts.str[1]
    return dim[["response_id", "variable_name", "response_name", "combined"]]
def run_step(self, prev, params):
    """Download the GDP CSV and derive the country dimension from it.

    Args:
        prev: Unused.
        params: Unused.

    Returns:
        pandas.DataFrame: Distinct name/code pairs found from row position
        2306 onwards, with ``country_id`` counting up from 1. The same
        table is written to ``data_temp/dim_country.csv``.
    """
    logger.info("Creating Country Dimension...")
    source = pd.read_csv("https://datahub.io/core/gdp/r/gdp.csv")

    # Same 2306-row offset that the GDP transform step applies.
    tail = source.iloc[2306:]
    countries = tail[["Country Name", "Country Code"]].drop_duplicates()
    countries = countries.reset_index(drop=True)
    countries["country_id"] = countries.index + 1

    renamed = countries.rename(columns={
        "Country Name": "country_name",
        "Country Code": "country_code",
    })
    dim = renamed[["country_id", "country_name", "country_code"]]
    dim.to_csv("data_temp/dim_country.csv", index=False,
               quoting=csv.QUOTE_NONNUMERIC)
    return dim
def run_step(self, prev, params):
    """Parse one claims XML file into a DataFrame with a row per ``week``.

    Args:
        prev: Unused.
        params: Mapping; ``params["filename"]`` names the file inside
            ``unemployment_data/``.

    Returns:
        pandas.DataFrame: One row per ``week`` element under the
        ``r539cyState`` root, one column per child key. A key missing from
        some weeks yields NaN in those rows.
    """
    logger.info("Processing {}".format(params.get("filename")))
    with open("unemployment_data/{}".format(params["filename"]), "r") as file:
        xml = file.read()

    # The JSON round-trip turns nested mappings into plain dicts and lists.
    parsed = json.loads(json.dumps(xmltodict.parse(xml)))
    weeks = parsed['r539cyState']['week']

    # xmltodict returns a mapping rather than a list when an element occurs
    # only once; the previous ``data[0]`` lookup failed for such files.
    if isinstance(weeks, dict):
        weeks = [weeks]

    # A list of records is what the DataFrame constructor expects, so the
    # manual column-by-column accumulation is unnecessary.
    return pd.DataFrame(weeks)
def run_step(self, prev_result, params):
    """Convert the melted trade sheet into month / flow / HS2 rows.

    Only rows whose ``name`` contains ``'Total capítulo'`` survive. The
    ``hs2_id`` is the section number (Roman numeral converted to digits)
    concatenated with the chapter number, cast to int.

    Args:
        prev_result: DataFrame with the columns ``Periodo``, ``name`` and
            ``value``. It is left unmodified (helper columns used to be
            added to it in place).
        params: Unused.

    Returns:
        pandas.DataFrame: Columns ``month_id`` (int), ``trade_flow_id``,
        ``hs2_id`` (int) and ``value`` (float, multiplied by 1000).
    """
    logger.info('Running Xform step...')

    names = prev_result['name']
    df = prev_result.assign(
        month_id=prev_result['Periodo'].str.replace('/', ''),
        trade_flow_id=names.apply(
            lambda x: 1 if 'Importaciones' in x.split() else 2),
    )

    # .copy() after each row filter so later assignments write to an
    # independent frame instead of a filtered slice.
    df = df[df['value'] != 'C'].copy()
    # NOTE(review): the int cast truncates any fractional part before the
    # x1000 scaling below - confirm the source values are whole numbers.
    df['value'] = df['value'].fillna('0').astype(int)

    df['section'] = df['name'].str.extract(r'(Sección\s\w{1,5})',
                                           expand=False)
    chapter = df['name'].str.extract(r'(Capítulo(|s)\s\d{1,3})')[0]
    # Removing every 's' turns the plural 'Capítulos' into 'Capítulo'.
    df['chapter'] = chapter.str.replace('s', '')

    df = df.reset_index(drop=True)
    df = df[df['name'].str.contains('Total capítulo')].copy()

    section_id = df['section'].apply(
        lambda x: str(roman.fromRoman(x.split(' ')[-1])))
    chapter_id = df['chapter'].apply(lambda x: x.split(' ')[-1])
    df['hs2_id'] = section_id + chapter_id

    out = df[['month_id', 'trade_flow_id', 'hs2_id', 'value']].copy()
    out['value'] = out['value'].astype(float).fillna(0) * 1000
    out['month_id'] = out['month_id'].astype(int)
    out['hs2_id'] = out['hs2_id'].astype(int)
    return out
def run_step(self, prev_result, params):
    """Build the functional group / function / subfunction dimension.

    Distinct combinations of the six source columns are de-duplicated on
    the lower-cased, accent-stripped subfunction description, translated to
    English and spell-corrected.

    Args:
        prev_result: DataFrame with ``gpo_funcional``,
            ``desc_gpo_funcional``, ``id_funcion``, ``desc_funcion``,
            ``id_subfuncion`` and ``desc_subfuncion``.
        params: Unused.

    Returns:
        pandas.DataFrame: The ``*_id``, ``*_es`` and ``*_en`` columns for
        the three levels plus the join key ``desc_subfuncion_mod``.
    """
    logger.info('Running function dimension step...')

    key_cols = [
        'gpo_funcional', 'desc_gpo_funcional', 'id_funcion', 'desc_funcion',
        'id_subfuncion', 'desc_subfuncion'
    ]
    df = prev_result[key_cols]
    df = df.groupby(key_cols).size().to_frame(name="count").reset_index()

    # Only the subfunction key is needed for de-duplication and for the
    # join in the fact step; the two other normalised columns were computed
    # and then discarded, so they are no longer built.
    df['desc_subfuncion_mod'] = df['desc_subfuncion'].apply(
        lambda x: unidecode.unidecode(x.lower()))
    df.drop_duplicates(subset='desc_subfuncion_mod', keep='first',
                       inplace=True)
    df.drop(columns=['count'], inplace=True)

    # Built once: the previous version constructed a new translator object
    # for every row of every column.
    translator = GoogleTranslator(source='auto', target='en')
    text_cols = {
        'desc_subfuncion': ('subfunction_des_es', 'subfunction_des_en'),
        'desc_funcion': ('function_des_es', 'function_des_en'),
        'desc_gpo_funcional': ('functional_group_desc_es',
                               'functional_group_desc_en'),
    }
    for source_col, (es_col, en_col) in text_cols.items():
        # The translation is made from the raw text; the Spanish column is
        # the spell-corrected text.
        df[en_col] = df[source_col].apply(
            lambda text: translator.translate(text).capitalize())
        df[es_col] = df[source_col].apply(
            lambda text: speller.correction(text).capitalize())

    # Ids: keep the integer part of the stringified value; function and
    # subfunction ids additionally get a '0' prefix.
    group_id = df['gpo_funcional'].astype(str).str.split('.').str[0]
    function_part = '0' + df['id_funcion'].astype(str).str.split('.').str[0]
    subfunction_part = '0' + df['id_subfuncion'].astype(str).str.split(
        '.').str[0]
    df['functional_group_id'] = group_id
    df['function_id'] = group_id + function_part
    df['subfunction_id'] = group_id + function_part + subfunction_part

    return df[[
        'functional_group_id', 'functional_group_desc_es',
        'functional_group_desc_en', 'function_id', 'function_des_es',
        'function_des_en', 'subfunction_id', 'subfunction_des_es',
        'subfunction_des_en', 'desc_subfuncion_mod'
    ]]
def run_step(self, prev_result, params):
    """Build the chapter / concept dimension from the shared ``global_df``.

    Distinct combinations of the four source columns are de-duplicated on
    the lower-cased, accent-stripped concept description and translated to
    English. Rows whose concept is ``'No registrado'`` are replaced by a
    fixed placeholder (chapter 0, concept 999).

    Args:
        prev_result: Unused; the input comes from
            ``self.get_pipeline_results_ref()["global_df"]``.
        params: Unused.

    Returns:
        pandas.DataFrame: ``chapter_id`` and ``concept_id`` (both str),
        their Spanish and English descriptions, and the join key
        ``desc_concepto_mod``.
    """
    logger.info('Running chapter-concept dimension step...')
    results = self.get_pipeline_results_ref()

    key_cols = ['id_capitulo', 'desc_capitulo', 'id_concepto',
                'desc_concepto']
    df = results["global_df"][key_cols]
    df = df.groupby(key_cols).size().to_frame(name="count").reset_index()

    df['desc_concepto_mod'] = df['desc_concepto'].apply(
        lambda x: unidecode.unidecode(x.lower()))
    df.drop_duplicates(subset='desc_concepto_mod', keep='first',
                       inplace=True)
    df.drop(columns=['count'], inplace=True)

    # Built once: the previous version constructed a new translator object
    # for every row.
    translator = GoogleTranslator(source='auto', target='en')

    df['chapter_des_en'] = df['desc_capitulo'].apply(
        lambda text: translator.translate(text).capitalize())
    df['desc_capitulo'] = df['desc_capitulo'].apply(
        lambda text: speller.correction(text).capitalize())

    df['concept_des_en'] = df['desc_concepto'].apply(
        lambda text: translator.translate(text).capitalize())
    # The concept description is only capitalised, not spell-corrected.
    df['desc_concepto'] = df['desc_concepto'].apply(
        lambda text: text.capitalize())

    df.rename(columns={
        'desc_capitulo': 'chapter_des',
        'id_capitulo': 'chapter_id',
        'desc_concepto': 'concept_des',
        'id_concepto': 'concept_id'
    }, inplace=True)

    # Placeholder row for unregistered concepts.
    df.loc[df['concept_des'] == 'No registrado', [
        'chapter_id', 'chapter_des', 'chapter_des_en', 'concept_id',
        'concept_des', 'concept_des_en'
    ]] = [
        0, 'No registrado', 'Not registered', 999, 'No registrado',
        'Not registered'
    ]

    df['chapter_id'] = df['chapter_id'].astype(str)
    df['concept_id'] = df['concept_id'].astype(str)
    return df[[
        'chapter_id', 'chapter_des', 'chapter_des_en', 'concept_id',
        'concept_des', 'concept_des_en', 'desc_concepto_mod'
    ]]
def run_step(self, prev_result, params):
    """Read the quarterly workbooks into one frame with a common layout.

    Each workbook is reduced to the column list that matches its file name
    and renamed to the ``columns_required_1`` layout; ``quarter_id`` comes
    from the module-level ``files`` mapping (keyed by ``<filename>.xlsx``).

    Args:
        prev_result: Iterable of items where ``item[0]`` is the workbook
            path and ``item[1]['filename']`` its name without extension.
        params: Unused.

    Returns:
        pandas.DataFrame: All workbooks stacked (row labels not reset).
        Missing amounts are set to 0, rows with any other missing value are
        dropped, description columns are capitalised, rows whose
        ``id_entidad_federativa`` is a single space are removed and entity
        id 33 is recoded to 35.
    """
    logger.info('Running read step...')

    layout_1_2_files = {
        'pef_ac01_avance_2t_2020', 'pef_ac01_avance_2t_2021',
        'pef_ac01_avance_3t_2020', 'pef_ac01_avance_1t_2021',
        'pef_ac01_avance_1t_2020'
    }
    layout_1_1_files = {
        'ac01_avance_2t19', 'ac01_avance_marzo19', 'pef_ac01_avance_4t_2020'
    }
    layout_2_files = {'pef_ac01_avance_2t_2016'}

    frames = []
    for file_ in prev_result:
        filename = file_[1]['filename']
        temp_df = pd.read_excel(file_[0])
        temp_df.columns = temp_df.columns.str.lower()

        # The four branches differed only in the source column list.
        if filename in layout_1_2_files:
            source_cols = columns_required_1_2
        elif filename in layout_1_1_files:
            source_cols = columns_required_1_1
        elif filename in layout_2_files:
            source_cols = columns_required_2
        else:
            source_cols = columns_required_1

        # .copy() so the assignments below write to an independent frame
        # rather than to a column selection.
        temp_df = temp_df[source_cols].copy()
        temp_df.columns = columns_required_1
        temp_df['quarter_id'] = files[filename + '.xlsx']
        frames.append(temp_df)

    # One concat instead of growing the frame once per file.
    df = pd.concat(frames) if frames else pd.DataFrame()

    amount_cols = ['monto_pagado', 'monto_aprobado']
    df[amount_cols] = df[amount_cols].fillna(value=0)
    df.dropna(inplace=True)

    for col in ['desc_gpo_funcional', 'desc_funcion', 'desc_subfuncion',
                'desc_ramo', 'desc_tipogasto', 'desc_capitulo',
                'desc_concepto']:
        df[col] = df[col].apply(lambda x: x.capitalize())

    df = df[df['id_entidad_federativa'] != ' '].copy()
    # Entity id 33 is recoded to 35. A string-based replace of '33' -> '35'
    # did not work because the ids are ints here, not strings.
    df['id_entidad_federativa'] = df['id_entidad_federativa'].apply(
        lambda x: 35 if x == 33 else x).astype(int)
    df['monto_pagado'] = df['monto_pagado'].astype(float)
    return df
def run_step(self, prev, params):
    """Log the step name and hand the previous result on unchanged.

    Args:
        prev: Result of the previous step.
        params: Unused.

    Returns:
        The same object that was passed in as ``prev``.
    """
    logger.info("TransformStep")
    return prev
def run_step(self, prev, params):
    """Log the step name and pass the previous result through.

    The original body returned a local name ``df`` that was never
    assigned, so every call raised ``NameError``. Nothing is opened in
    this step, so the incoming result is returned instead.

    Args:
        prev: Result of the previous step.
        params: Unused.

    Returns:
        The same object that was passed in as ``prev``.
    """
    logger.info("OpenStep")
    return prev
def run_step(self, prev_result, params):
    """Join the shared ``global_df`` with the four dimensions.

    Reads ``global_df``, ``dim_fun``, ``dim_cap``, ``dim_dep`` and
    ``dim_exp`` from ``self.get_pipeline_results_ref()``. Each dimension is
    left-joined on the lower-cased, accent-stripped description. As before,
    the four ``*_mod`` key columns are added to the shared ``global_df``
    object itself.

    Args:
        prev_result: Unused.
        params: Unused.

    Returns:
        pandas.DataFrame: ``quarter_id`` (int), ``ent_id``, the dimension
        id columns, ``amount_paid`` and ``amount_approved``. Rows with a
        missing value in any id column are dropped.
    """
    logger.info('Running fact table step...')
    results = self.get_pipeline_results_ref()
    df = results["global_df"]

    # (description column, join key, dimension, dimension de-dup column)
    joins = [
        ('desc_subfuncion', 'desc_subfuncion_mod', results["dim_fun"],
         'subfunction_des_es'),
        ('desc_concepto', 'desc_concepto_mod', results["dim_cap"],
         'concept_des'),
        ('desc_ramo', 'desc_ramo_mod', results["dim_dep"],
         'department_des'),
        ('desc_tipogasto', 'desc_tipogasto_mod', results["dim_exp"],
         'exp_type_des'),
    ]
    for source_col, key, _dim, _dedupe_col in joins:
        df[key] = df[source_col].apply(
            lambda x: unidecode.unidecode(x.lower()))
    for _source_col, key, dim, dedupe_col in joins:
        df = df.merge(dim.drop_duplicates(subset=[dedupe_col]),
                      on=key, how="left")

    df = df.rename(columns={
        'monto_pagado': 'amount_paid',
        'monto_aprobado': 'amount_approved',
        'id_entidad_federativa': 'ent_id',
    })

    id_cols = [
        'quarter_id', 'ent_id', 'functional_group_id', 'function_id',
        'subfunction_id', 'department_id', 'exp_type_id', 'chapter_id',
        'concept_id'
    ]
    amount_cols = ['amount_paid', 'amount_approved']
    # Selecting the final columns also discards every helper and
    # description column, so no separate drop is needed.
    df = df[id_cols + amount_cols]

    # Drop incomplete rows BEFORE casting. The str cast used to run first,
    # turning a missing department_id / exp_type_id into the text 'nan',
    # which the later dropna could no longer detect.
    df = df.dropna(subset=id_cols).copy()

    for col in amount_cols:
        # Blank cells become 0. The previous lambda evaluated ``x == 0``
        # for blanks and therefore stored False instead of 0.
        df[col] = df[col].fillna(0).apply(lambda x: 0 if x == ' ' else x)

    df['quarter_id'] = df['quarter_id'].astype(int)
    df['department_id'] = df['department_id'].astype(str)
    df['exp_type_id'] = df['exp_type_id'].astype(str)
    return df
def run_step(self, prev, params):
    """Open the trade CSV that matches the given flow and year.

    Args:
        prev: Unused.
        params: Mapping whose ``trade_flow`` and ``year`` entries are
            formatted into the file name ``Wakanda_<flow>_<year>.csv``.

    Returns:
        pandas.DataFrame: Contents of ``data/<file name>``.
    """
    trade_flow = params.get("trade_flow")
    year = params.get("year")
    filename = "Wakanda_{}_{}.csv".format(trade_flow, year)
    logger.info("Opening {}...".format(filename))
    return pd.read_csv("data/" + filename)
def run_step(self, prev, params):
    """Log the step name and return the previous result as-is.

    Args:
        prev: Result of the previous step.
        params: Unused.

    Returns:
        The same object that was passed in as ``prev``.
    """
    logger.info("TransformStep...")
    return prev