def calc_rca(ybc, year):
    """Attach a ``students_rca`` column to ``ybc``.

    For each geographic depth (``bra_id`` of length 2, 4 and 8) the rows
    whose ``course_sc_id`` is 5 characters long are pivoted into a
    ``bra_id`` x ``course_sc_id`` matrix of ``students`` and passed to
    ``growth.rca``.  The stacked per-depth results are concatenated, zeros
    are turned into NaN, all-NaN rows are dropped, and the result is
    outer-merged onto ``ybc`` by index.

    Args:
        ybc: DataFrame that yields ``bra_id``, ``course_sc_id`` and
            ``students`` columns after ``reset_index()``.
            NOTE(review): the final merge is on index, so the index is
            presumably (year, bra_id, course_sc_id) -- confirm with caller.
        year: value convertible with ``int``; stamped on every RCA row.

    Returns:
        ``ybc`` outer-merged with the ``students_rca`` rows.
    """
    flat = ybc.reset_index()
    # Neither the flattened frame nor the course filter depends on the
    # geographic depth, so both are computed once up front.
    course_mask = flat["course_sc_id"].str.len() == 5
    bra_len = flat["bra_id"].str.len()

    rcas = pd.DataFrame()
    for geo_level in [2, 4, 8]:
        print("geo level: %s" % (geo_level,))

        subset = flat[(bra_len == geo_level) & course_mask]
        subset = subset[["bra_id", "course_sc_id", "students"]]
        matrix = subset.pivot(
            index="bra_id", columns="course_sc_id", values="students")

        level_rca = pd.DataFrame(
            growth.rca(matrix).stack(), columns=["students_rca"])
        # The first non-empty result replaces the empty placeholder frame;
        # later results are appended below it.
        rcas = level_rca if rcas.empty else pd.concat([rcas, level_rca])

    # A zero RCA is stored as missing rather than as an explicit 0.
    rcas = rcas.replace(0, np.nan)
    rcas = rcas.dropna(how="all")

    # Append ``year`` as an index level, then move it to the front so the
    # levels read year, bra_id, course_sc_id.
    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    rcas = rcas.swaplevel("year", "course_sc_id")
    rcas = rcas.swaplevel("year", "bra_id")

    ybc = ybc.merge(rcas, how="outer", left_index=True, right_index=True)
    return ybc
def get_ybi_rcas(ybi, geo_level):
    """Return a thresholded RCA matrix of ``bra_id`` by ``cnae_id``.

    Keeps the rows whose ``cnae_id`` is 6 characters long and whose
    ``bra_id`` is ``geo_level`` characters long, pivots ``wage`` into a
    ``bra_id`` x ``cnae_id`` matrix (missing cells filled with 0), runs
    ``growth.rca`` on it and thresholds the result at 1.

    Args:
        ybi: DataFrame that yields ``bra_id``, ``cnae_id`` and ``wage``
            columns after ``reset_index()``.
        geo_level: required length of ``bra_id``.

    Returns:
        The ``growth.rca`` result with values >= 1 set to 1 and values < 1
        set to 0.
    """
    flat = ybi.reset_index()
    industry_ok = flat["cnae_id"].str.len() == 6
    location_ok = flat["bra_id"].str.len() == geo_level

    wages = flat[industry_ok & location_ok][["bra_id", "cnae_id", "wage"]]
    wage_matrix = wages.pivot(
        index="bra_id", columns="cnae_id", values="wage").fillna(0)

    rcas = growth.rca(wage_matrix)
    # Binarise in place: the >= 1 pass must run before the < 1 pass.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    return rcas
def calc_rca(ybuc, year):
    """Compute ``enrolled_rca`` per ``bra_id`` / ``course_hedu_id``.

    ``ybuc`` is summed over the index levels ``year``, ``bra_id`` and
    ``course_hedu_id``; the ``enrolled`` totals are pivoted into a
    ``bra_id`` x ``course_hedu_id`` matrix and passed to ``growth.rca``.
    Zeros in the stacked result become NaN and all-NaN rows are dropped.

    Args:
        ybuc: DataFrame with index levels ``year``, ``bra_id`` and
            ``course_hedu_id`` (others are summed away) and an ``enrolled``
            column.
        year: value convertible with ``int``; stamped on every row.

    Returns:
        DataFrame with a single ``enrolled_rca`` column whose ``year``
        index level has been moved in front of ``bra_id`` and
        ``course_hedu_id``.
    """
    totals = ybuc.groupby(level=["year", "bra_id", "course_hedu_id"]).sum()
    totals = totals[["enrolled"]].reset_index().drop("year", axis=1)

    matrix = totals.pivot(
        index="bra_id", columns="course_hedu_id", values="enrolled")
    rcas = pd.DataFrame(growth.rca(matrix).stack(), columns=["enrolled_rca"])

    # A zero RCA is stored as missing rather than as an explicit 0.
    rcas = rcas.replace(0, np.nan).dropna(how="all")

    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    # Two swaps move the appended ``year`` level to the front.
    rcas = rcas.swaplevel("year", "course_hedu_id").swaplevel("year", "bra_id")
    return rcas
def get_rca_matrix(usable_countries, **kwargs):
    """Build a 0/1 country x product RCA matrix for one year.

    Args:
        usable_countries: country ids to keep; intersected with the ids
            actually present in the queried data.
        **kwargs: must contain ``year`` (formatted into the query's
            ``WHERE`` clause) and ``db`` (connection handed to
            ``sql.read_frame``).

    Returns:
        The ``growth.rca`` result for the retained countries, with values
        >= 1 set to 1 and values < 1 set to 0.
    """
    query = (
        " SELECT country_id, product_id, export_value"
        " FROM observatory_hs4_cpy"
        " WHERE year = {0} "
    ).format(kwargs["year"])
    exports = sql.read_frame(query, kwargs["db"])

    # Transform the flat list into a multidimensional matrix:
    # rows = countries, columns = products.
    mcp = exports.pivot(
        index="country_id", columns="product_id", values="export_value")

    # Again, take the intersection of our list of countries (those that
    # match the atlas criteria) and the ones found in the data.
    keep = list(set(mcp.index).intersection(usable_countries))

    # Reindex the matrix with the retained countries; this function matches
    # up the indexes that exist and discards the ones that don't.
    mcp = mcp.reindex(index=keep, fill_value=0).fillna(0)

    # Use the growth stats library to calculate RCAs.
    mcp = growth.rca(mcp)

    # Convert RCAs to 0s and 1s (the >= 1 pass must run first).
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    return mcp
# Exploratory script: rank courses by enrolment RCA for one municipality.
# Relies on ``db``, ``pd`` and ``growth`` being set up earlier in the file.

# NOTE(review): ``cursor`` is not used in the visible part of this script.
cursor = db.cursor()

# Earlier exploratory queries, kept for reference:
# munic_totals = pd.read_sql("select bra_id, sum(enrolled) as enrolled from hedu_ybuc where year = 2012 group by year, bra_id", db, index_col="bra_id")
#
# df2 = pd.read_sql("select bra_id, name_en as course_name, sum(enrolled) as enrolled_by_course from hedu_ybuc, attrs_course_hedu where year = 2012 and bra_id = 'mg030000' and id = course_hedu_id group by year, bra_id, course_hedu_id", db, index_col="bra_id")
# print df2.head()
# totals = pd.read_sql("select course_hedu_id, sum(enrolled) as enrolled from hedu_ybuc where year = 2012 group by year, course_hedu_id", db, index_col="course_hedu_id")
# print totals.head()
# course_hedu_id = "811G01"
# df = pd.read_sql("select year, bra_id, name_en, sum(enrolled) as enrolled from hedu_ybuc, attrs_bra where year = 2012 and course_hedu_id = %s and id = bra_id group by year, bra_id", db, params=[course_hedu_id], index_col="bra_id")
# df["share"] = df["enrolled"] / munic_totals["enrolled"]
# print df.sort("share", ascending=False).head()

# ``enrolled`` must be aggregated: the query groups by (bra_id,
# course_hedu_id), so a bare ``enrolled`` would be taken from one arbitrary
# row of each group (or be rejected under ONLY_FULL_GROUP_BY).
rcas = pd.read_sql(
    "select bra_id, course_hedu_id, sum(enrolled) as enrolled "
    "from hedu_ybuc where year = 2012 "
    "group by year, bra_id, course_hedu_id",
    db)
rcas = rcas.pivot(index="bra_id", columns="course_hedu_id", values="enrolled")
# rcas = rcas.pivot(index="course_hedu_id", columns="bra_id", values="enrolled")
rcas = growth.rca(rcas)
rcas = pd.DataFrame(rcas.stack(), columns=["enrolled_rca"])

course_names = pd.read_sql(
    "select id as course_hedu_id, name_en from attrs_course_hedu",
    db, index_col="course_hedu_id")
# print course_names.head()
# print rcas.ix["212M02"].sort('enrolled_rca', ascending=False).head(10)
# sys.exit()

# Take an explicit copy so adding a column below does not write into a
# slice of ``rcas``.
munic = rcas.loc["sp120509"].copy()
# Aligned on the course_hedu_id index shared by both frames.
munic["course_name"] = course_names["name_en"]
# NOTE(review): ``DataFrame.sort`` is the legacy spelling (newer pandas uses
# ``sort_values``); left unchanged to match the pandas this script targets.
print(munic.sort("enrolled_rca", ascending=False).head(10))
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every ``bra_id`` length listed in ``depths["bra"]`` the rows with a
    6-character ``cnae_id`` are pivoted into a ``bra_id`` x ``cnae_id``
    matrix of ``wage``.  From that matrix the function derives RCAs
    (``growth.rca``), a 0/1 version of them, proximities, distances,
    product complexity and opportunity gain, and records one
    (year, bra, cnae, rca, distance, opp_gain) row per matrix cell.

    Args:
        ybi: DataFrame that yields ``bra_id``, ``cnae_id`` and ``wage``
            columns after ``reset_index()``.
            NOTE(review): its index is unioned with (year, bra_id, cnae_id)
            tuples, so it is presumably indexed by those levels -- confirm.
        yi: not used by this function; kept so the signature is unchanged.
        year: value convertible to int; stamped on every computed row.
        depths: mapping whose ``"bra"`` entry lists the ``bra_id`` lengths
            to process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        rows, with ``rca`` (zeros stored as NaN), ``distance`` and
        ``opp_gain`` columns assigned by index alignment.
    """
    flat = ybi.reset_index()
    # Neither the flattened frame nor the industry filter depends on the
    # geographic depth, so both are computed once up front.
    cnae_mask = flat["cnae_id"].str.len() == 6
    bra_len = flat["bra_id"].str.len()

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print("geo level: %s" % (geo_level,))

        wages = flat[(bra_len == geo_level) & cnae_mask]
        wages = wages[["bra_id", "cnae_id", "wage"]]

        # RCAS -- missing cells are left as NaN (no zero-fill) for growth.rca.
        wage_matrix = wages.pivot(
            index="bra_id", columns="cnae_id", values="wage")
        rcas = growth.rca(wage_matrix)

        # 0/1 version; the >= 1 pass must run before the < 1 pass.
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # DISTANCES
        # calculate proximity for opportunity gain calculation
        prox = growth.proximity(rcas_binary)
        # calculate distances using proximity
        dist = growth.distance(rcas_binary, prox).fillna(0)

        # OPP GAIN
        # calculate product complexity (second element of the result)
        pci = growth.complexity(rcas_binary)[1]
        # calculate opportunity gain
        opp_gain = growth.opportunity_gain(rcas_binary, prox, pci)

        # One row per (bra, cnae) cell, bra-major, in matrix order.
        rca_dist_opp.extend(
            [year, bra, cnae,
             rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]]
            for bra in rcas.index
            for cnae in rcas.columns
        )

    # now time to merge!
    print("merging datasets...")

    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Store a zero RCA as missing.  Done as a column-level replace rather
    # than chained indexing, which may assign into a temporary copy.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])

    # Rows that exist only in ybi_rdo are added with NaN (not 0) values.
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    return ybi
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every ``bra_id`` length listed in ``depths["bra"]`` the rows with a
    6-character ``cnae_id`` are pivoted into a ``bra_id`` x ``cnae_id``
    matrix of ``wage``.  From that matrix the function derives RCAs
    (``growth.rca``), a 0/1 version of them, proximities, distances,
    product complexity and opportunity gain, and records one
    (year, bra, cnae, rca, distance, opp_gain) row per matrix cell.

    Args:
        ybi: DataFrame that yields ``bra_id``, ``cnae_id`` and ``wage``
            columns after ``reset_index()``.
            NOTE(review): its index is unioned with (year, bra_id, cnae_id)
            tuples, so it is presumably indexed by those levels -- confirm.
        yi: not used by this function; kept so the signature is unchanged.
        year: value convertible to int; stamped on every computed row.
        depths: mapping whose ``"bra"`` entry lists the ``bra_id`` lengths
            to process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        rows, with ``rca`` (zeros stored as NaN), ``distance`` and
        ``opp_gain`` columns assigned by index alignment.
    """
    flat = ybi.reset_index()
    # Neither the flattened frame nor the industry filter depends on the
    # geographic depth, so both are computed once up front.
    cnae_mask = flat["cnae_id"].str.len() == 6
    bra_len = flat["bra_id"].str.len()

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print("geo level: %s" % (geo_level,))

        wages = flat[(bra_len == geo_level) & cnae_mask]
        wages = wages[["bra_id", "cnae_id", "wage"]]

        # RCAS -- missing cells are left as NaN (no zero-fill) for growth.rca.
        wage_matrix = wages.pivot(
            index="bra_id", columns="cnae_id", values="wage")
        rcas = growth.rca(wage_matrix)

        # 0/1 version; the >= 1 pass must run before the < 1 pass.
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # DISTANCES
        # calculate proximity for opportunity gain calculation
        prox = growth.proximity(rcas_binary)
        # calculate distances using proximity
        dist = growth.distance(rcas_binary, prox).fillna(0)

        # OPP GAIN
        # calculate product complexity (second element of the result)
        pci = growth.complexity(rcas_binary)[1]
        # calculate opportunity gain
        opp_gain = growth.opportunity_gain(rcas_binary, prox, pci)

        # One row per (bra, cnae) cell, bra-major, in matrix order.
        rca_dist_opp.extend(
            [year, bra, cnae,
             rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]]
            for bra in rcas.index
            for cnae in rcas.columns
        )

    # now time to merge!
    print("merging datasets...")

    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Store a zero RCA as missing.  Done as a column-level replace rather
    # than chained indexing, which may assign into a temporary copy.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])

    # Rows that exist only in ybi_rdo are added with NaN (not 0) values.
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    return ybi
# Earlier exploratory queries, kept for reference:
# print df2.head()
# totals = pd.read_sql("select course_hedu_id, sum(enrolled) as enrolled from hedu_ybuc where year = 2012 group by year, course_hedu_id", db, index_col="course_hedu_id")
# print totals.head()
# course_hedu_id = "811G01"
# df = pd.read_sql("select year, bra_id, name_en, sum(enrolled) as enrolled from hedu_ybuc, attrs_bra where year = 2012 and course_hedu_id = %s and id = bra_id group by year, bra_id", db, params=[course_hedu_id], index_col="bra_id")
# df["share"] = df["enrolled"] / munic_totals["enrolled"]
# print df.sort("share", ascending=False).head()

# ``enrolled`` must be aggregated: the query groups by (bra_id,
# course_hedu_id), so a bare ``enrolled`` would be taken from one arbitrary
# row of each group (or be rejected under ONLY_FULL_GROUP_BY).
rcas = pd.read_sql(
    "select bra_id, course_hedu_id, sum(enrolled) as enrolled "
    "from hedu_ybuc where year = 2012 "
    "group by year, bra_id, course_hedu_id",
    db)
rcas = rcas.pivot(index="bra_id", columns="course_hedu_id", values="enrolled")
# rcas = rcas.pivot(index="course_hedu_id", columns="bra_id", values="enrolled")
rcas = growth.rca(rcas)
rcas = pd.DataFrame(rcas.stack(), columns=["enrolled_rca"])

course_names = pd.read_sql(
    "select id as course_hedu_id, name_en from attrs_course_hedu",
    db, index_col="course_hedu_id")
# print course_names.head()
# print rcas.ix["212M02"].sort('enrolled_rca', ascending=False).head(10)
# sys.exit()

# Take an explicit copy so adding a column below does not write into a
# slice of ``rcas``.
munic = rcas.loc["sp120509"].copy()
# Aligned on the course_hedu_id index shared by both frames.
munic["course_name"] = course_names["name_en"]
# NOTE(review): ``DataFrame.sort`` is the legacy spelling (newer pandas uses
# ``sort_values``); left unchanged to match the pandas this script targets.
print(munic.sort("enrolled_rca", ascending=False).head(10))