]] df = pd.concat([df, missing]) df = df[~df.continent_code.isnull()].reset_index(drop=True) df["level"] = "country" df = df.rename( columns={ "code_col": "code", "name_es_col": "name_es", "continent_code": "parent_code", }) assert df.loc[6, "name"] is pd.np.nan df.loc[6, "name"] = u"Netherlands Antilles" regions = pd.read_table( "../Mexico/in/Mexico Country codes - continents - Continents - Regions.tsv", encoding="utf-8", ) df = pd.concat([df, regions]).reset_index(drop=True) h = Hierarchy(["region", "country"]) parent_id_table = parent_code_table_to_parent_id_table(df, h) c = Classification(parent_id_table, h) c.to_csv("out/locations_international_dane.csv") # c.to_stata("out/locations_international_dane.dta")
# Assemble the HS92 classification (section -> 2digit -> 4digit) from the
# rows of `hs4` and export it as CSV and Stata files.

# Row offsets into `hs4` at which the 2digit rows and the section rows begin.
# NOTE(review): presumably the rows before TWO_DIGIT_START are the 4digit
# rows; `four_digit` is built outside this fragment -- confirm.
TWO_DIGIT_START = 1241
SECTION_START = 1339

# A 4digit row's parent code is the first two characters of its own code.
four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
four_digit = four_digit.drop("community", axis=1)
four_digit["level"] = "4digit"

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `hs4` (the SettingWithCopyWarning pattern).
two_digit = hs4.iloc[TWO_DIGIT_START:SECTION_START].copy()
two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
# For 2digit rows the "community" column supplies the parent code.
two_digit = two_digit.rename(columns={"community": "parent_code"})
two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
two_digit["level"] = "2digit"

# Sections are the top level, so they have no parent.
section = hs4.iloc[SECTION_START:].drop("community", axis=1)
section["code"] = section.code.astype(str).str.zfill(3)
section["parent_code"] = None
section["level"] = "section"

hs_clean = pd.concat([section, two_digit, four_digit])
hs_clean = hs_clean.reset_index(drop=True)

h = Hierarchy(["section", "2digit", "4digit"])
hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
c = Classification(hs_clean, h)

# community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
# hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

# weird bug where pandas infer_type was returning mixed instead of string
c.table.code = c.table.code.astype(str)

c.to_csv("out/hs92_atlas.csv")
c.to_stata("out/hs92_atlas.dta")
# Build the Mexican SCIAN 2007 industry classification from the ordered code
# table in `df` and export it as CSV and Stata files.

# Replace trailing comma and space.
# regex=True is spelled out because ", $" is a regular expression (comma,
# space, end of string); pandas 2.0 changed the default of str.replace to a
# literal match, under which this pattern would never match.
df["name_spanish"] = df["name_spanish"].str.replace(", $", "", regex=True)
df["name_english"] = df["name_english"].str.replace(", $", "", regex=True)

h = Hierarchy(["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"])

# The level of a row is determined by the number of characters in its code.
# Rows with any other code length are left untouched.
code_length = df.code.str.len()
for n_chars, level_name in [
    (2, "twodigit"),
    (3, "threedigit"),
    (4, "fourdigit"),
    (5, "fivedigit"),
    (6, "sixdigit"),
]:
    df.loc[code_length == n_chars, "level"] = level_name

# Keep the Spanish names aside; they are merged back in after the parent ids
# have been computed from the English-named table.
spanish = df[["code", "level", "name_spanish"]].rename(
    columns={"name_spanish": "name_es"}
)

# make sure this is the hand-fixed version
assert df.loc[304, "code"] == "31"

df = df[["code", "name_english", "level"]].rename(columns={"name_english": "name"})

parent_code_table = ordered_table_to_parent_code_table(df, h)
parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)
parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

c = Classification(parent_id_table, h)

c.to_csv("out/industries_mexico_scian_2007.csv")
c.to_stata("out/industries_mexico_scian_2007.dta")
# Turn the repeated hierarchy table plus the names table into a parent-id
# table with a fixed set of output columns.

# Drop the 5-digit level.
names = names.loc[names["level"] != "5digit"]

# Ignore the first column of the hierarchy table, then collapse the rows
# that became identical as a result.
hierarchy = hierarchy.iloc[:, 1:]
hierarchy = hierarchy.drop_duplicates()

# One empty list per level, handed to repeated_table_to_parent_id_table.
fields = {
    "section": [],
    "2digit": [],
    "3digit": [],
    "4digit": [],
}
h = Hierarchy(["section", "2digit", "3digit", "4digit"])

parent_code_table = repeated_table_to_parent_id_table(hierarchy, h, fields)
# Codes are cast to str before merging with `names` on code and level.
parent_code_table["code"] = parent_code_table["code"].astype(str)
parent_code_table = parent_code_table.merge(names, on=["code", "level"])

# Sort by level order (not necessarily alphabetical)
parent_code_table = sort_by_code_and_level(parent_code_table, h)

parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

# The generic "name" column carries the English name.
parent_id_table["name"] = parent_id_table["name_en"]

# Keep only the output columns, in this order.
parent_id_table = parent_id_table.loc[
    :,
    [
        "code",
        "name",
        "level",
        "name_en",
        "name_es",
        "name_short_en",
        "name_short_es",
        "parent_id",
    ],
]
if __name__ == "__main__":
    # Command-line entry point: read a UTF-16 DANE table, turn it into a
    # classification and write <prefix>.csv and <prefix>.dta.
    #
    # Usage: <script> <input_file> <output_prefix>

    # Validate the arguments explicitly; an `assert` would be stripped under
    # `python -O` and gives no hint about the expected arguments.
    if len(sys.argv) != 3:
        sys.exit("usage: {0} <input_file> <output_prefix>".format(sys.argv[0]))

    from classification import (
        parent_code_table_to_parent_id_table,
        Classification,
        Hierarchy,
        ordered_table_to_parent_code_table,
    )

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)

    # Keep only the first row for each code.
    df = df[~df.duplicated(["code"])]
    df = df.reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    df["name"] = df["name"].str.title()

    h = Hierarchy(DANE_HIERARCHY)
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)
    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
# Assemble the HS92 classification (section -> 2digit -> 4digit) from the
# rows of `hs4` and export it as CSV and Stata files.

# Row offsets into `hs4` at which the 2digit rows and the section rows begin.
# NOTE(review): presumably the rows before TWO_DIGIT_START are the 4digit
# rows; `four_digit` is built outside this fragment -- confirm.
TWO_DIGIT_START = 1241
SECTION_START = 1339

# A 4digit row's parent code is the first two characters of its own code.
four_digit["parent_code"] = four_digit.code.apply(lambda x: x[:2])
four_digit = four_digit.drop("community", axis=1)
four_digit["level"] = "4digit"

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `hs4` (the SettingWithCopyWarning pattern).
two_digit = hs4.iloc[TWO_DIGIT_START:SECTION_START].copy()
two_digit["code"] = two_digit.code.astype(str).str.zfill(2)
# For 2digit rows the "community" column supplies the parent code.
two_digit = two_digit.rename(columns={"community": "parent_code"})
two_digit["parent_code"] = two_digit.parent_code.astype(str).str.zfill(3)
two_digit["level"] = "2digit"

# Sections are the top level, so they have no parent.
section = hs4.iloc[SECTION_START:].drop("community", axis=1)
section["code"] = section.code.astype(str).str.zfill(3)
section["parent_code"] = None
section["level"] = "section"

hs_clean = pd.concat([section, two_digit, four_digit])
hs_clean = hs_clean.reset_index(drop=True)

h = Hierarchy(["section", "2digit", "4digit"])
hs_clean = parent_code_table_to_parent_id_table(hs_clean, h)
c = Classification(hs_clean, h)

#community = pd.read_table("in/hs4_community.tsv", encoding="utf-8")
#hs4 = hs4.merge(community, left_on="community", right_on="code", how="inner")

# weird bug where pandas infer_type was returning mixed instead of string
c.table.code = c.table.code.astype(str)

c.to_csv("out/hs92_atlas.csv")
c.to_stata("out/hs92_atlas.dta")