# Replace trailing comma and space.
# The pattern is a regular expression ("$" anchors the end of the string), so
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the trailing ", " in place.
df.name_spanish = df.name_spanish.str.replace(r", $", "", regex=True)
df.name_english = df.name_english.str.replace(r", $", "", regex=True)

# Level names, ordered from 2-character codes up to 6-character codes.
level_names = ["twodigit", "threedigit", "fourdigit", "fivedigit", "sixdigit"]
h = Hierarchy(list(level_names))

# Tag every row with the level matching the length of its code (2..6).
# Rows whose code has any other length are left untouched.
code_length = df.code.str.len()
for n_digits, level_name in enumerate(level_names, start=2):
    df.loc[code_length == n_digits, "level"] = level_name

# Keep the Spanish names aside; they are merged back in after the
# parent-id table has been built from the English names.
spanish = df[["code", "level", "name_spanish"]].rename(
    columns={"name_spanish": "name_es"}
)

# make sure this is the hand-fixed version
assert df.loc[304, "code"] == "31", (
    "row 304 should have code '31'; is this the hand-fixed input file?"
)

df = df[["code", "name_english", "level"]].rename(
    columns={"name_english": "name"}
)

# NOTE(review): the two helpers below come from the project's classification
# module; presumably they derive parent links from row order and then turn
# parent codes into parent ids -- confirm against their definitions.
parent_code_table = ordered_table_to_parent_code_table(df, h)
parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

# Attach the Spanish names to the matching (level, code) rows.
parent_id_table = parent_id_table.merge(spanish, on=["level", "code"])

c = Classification(parent_id_table, h)

c.to_csv("out/industries_mexico_scian_2007.csv")
c.to_stata("out/industries_mexico_scian_2007.dta")
Classification) if __name__ == "__main__": sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1") sinco.columns = ["data"] sinco = sinco[~sinco.data.str.startswith("INEGI.")] sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")] for index, row in reversed( list(sinco[~sinco.data.str.match("^\d* ")].iterrows())): sinco.ix[index - 1] += (" " + sinco.ix[index]) sinco = sinco[sinco.data.str.match("^\d* ")] sinco = sinco.data.str.split(" ", 1).apply(pd.Series, 1) sinco.columns = ["code", "name"] sinco["level"] = sinco["code"].apply(lambda x: str(len(x)) + "digit") h = Hierarchy(["1digit", "2digit", "3digit", "4digit"]) parent_code_table = ordered_table_to_parent_code_table(sinco, h) parent_id_table = parent_code_table_to_parent_id_table( parent_code_table, h) c = Classification(parent_id_table, h) c.to_csv("out/occupations_sinco_2011.csv") c.to_stata("out/occupations_sinco_2011.dta")
if __name__ == "__main__":
    # Command-line entry point: read a DANE table, build a classification
    # from it and write "<prefix>.csv" and "<prefix>.dta".

    # Validate the arguments explicitly: an `assert` is stripped when Python
    # runs with -O and gives the user no hint about the expected usage.
    if len(sys.argv) != 3:
        sys.exit("usage: %s <input_file> <output_file_prefix>" % sys.argv[0])

    file_name = sys.argv[1]
    new_file_prefix = sys.argv[2]

    df = pd.read_table(file_name, encoding="utf-16")
    df = parse_dane(df)

    # Keep only the first row for each code, then renumber the rows.
    df = df.drop_duplicates(["code"])
    df = df.reset_index(drop=True)
    df.columns = ["name", "level", "code"]

    df["name"] = df["name"].str.title()

    from classification import (parent_code_table_to_parent_id_table,
                                Classification, Hierarchy,
                                ordered_table_to_parent_code_table)

    h = Hierarchy(DANE_HIERARCHY)

    # NOTE(review): helpers from the project's classification module;
    # presumably they infer parents from row order and then map parent codes
    # to ids -- confirm against their definitions.
    df = ordered_table_to_parent_code_table(df, h)
    df = parent_code_table_to_parent_id_table(df, h)

    c = Classification(df, h)

    # weird bug where pandas infer_type was returning mixed instead of string
    c.table.code = c.table.code.astype(str)

    c.to_csv(new_file_prefix + ".csv")
    c.to_stata(new_file_prefix + ".dta")
from classification import (Hierarchy, ordered_table_to_parent_code_table,
                            parent_code_table_to_parent_id_table,
                            Classification)


if __name__ == "__main__":
    # Build the SINCO 2011 occupation classification from a raw text dump and
    # write it out as CSV and Stata files.

    # Every line of the input file is loaded as one string in a single
    # "data" column.
    sinco = pd.read_csv("in/SINCO_2011.csv", header=None, encoding="latin-1")
    sinco.columns = ["data"]

    # Drop the lines starting with "INEGI." and the column-header lines.
    # The explicit copy makes the in-place edits below operate on an
    # independent frame rather than on a filtered view.
    sinco = sinco[~sinco.data.str.startswith("INEGI.")]
    sinco = sinco[~sinco.data.str.startswith(u"Clave Descripción")].copy()

    # A proper entry starts with its numeric code followed by a space.
    code_prefix = r"^\d* "

    # Lines without a leading code are continuations of the entry above them:
    # append each one to the row labelled (index - 1). Walking bottom-up lets
    # several consecutive continuation lines fold into one another before
    # reaching the entry they belong to.
    # `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0; the
    # lookup is label-based, exactly as `.ix` was on this integer index.
    continuation_index = sinco[~sinco.data.str.match(code_prefix)].index
    for index in continuation_index[::-1]:
        sinco.loc[index - 1, "data"] += " " + sinco.loc[index, "data"]

    # Only the (now complete) coded entries are kept.
    sinco = sinco[sinco.data.str.match(code_prefix)]

    # Split each entry at the first space into code and name. `n` must be
    # passed by keyword on pandas >= 2.0; expand=True yields the two columns
    # directly.
    sinco = sinco.data.str.split(" ", n=1, expand=True)
    sinco.columns = ["code", "name"]

    # The level name is derived from the code length, e.g. "12" -> "2digit".
    sinco["level"] = sinco["code"].str.len().astype(str) + "digit"

    h = Hierarchy(["1digit", "2digit", "3digit", "4digit"])

    # NOTE(review): helpers from the project's classification module;
    # presumably they infer parents from row order and then map parent codes
    # to ids -- confirm against their definitions.
    parent_code_table = ordered_table_to_parent_code_table(sinco, h)
    parent_id_table = parent_code_table_to_parent_id_table(parent_code_table, h)

    c = Classification(parent_id_table, h)

    c.to_csv("out/occupations_sinco_2011.csv")
    c.to_stata("out/occupations_sinco_2011.dta")