def prox(year, output_path, attr, i_attr, table, column):
    """Compute proximity matrices between `attr` and `i_attr` and write one
    TSV per year under `output_path/<year>/`.

    For each year and each depth of `attr`, the routine pulls the raw values
    from the DB, pivots them into an i_attr x attr matrix, binarizes the RCA
    values, and runs the proximity calculation.  Results for all depths of
    the same year are appended into a single file (header written only for
    the first depth).

    Relies on module-level globals: `depths`, `db`, `sql`, `ps_calcs`,
    `pd`, `os`, `get_years`.
    """
    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]
    years = get_years(year)
    # NOTE: the loop variable deliberately rebinds the `year` parameter.
    for year in years:
        print "year:", year
        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth
            query = """ SELECT {0}_id, {1}_id, {2} FROM {3} WHERE year=%s """.format(attr, i_attr, column, table)
            # Only constrain by id length when the attribute has multiple depths;
            # the inner attribute is always taken at its deepest level.
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)
            if len(i_attr_depths) > 1:
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])
            # secex tables are monthly; month=0 rows hold the yearly totals.
            if "secex" in table:
                query += " and month=0"
            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr),
                              columns="{}_id".format(attr),
                              values=column)
            # Binarize RCAs: 1 where RCA >= 1, else 0.
            rcas = ps_calcs.rca(data)
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0
            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(),
                                columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)
            output_path_w_year = os.path.abspath(
                os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year):
                os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year,
                              "{}_{}_proximity.tsv".format(attr, i_attr))
            # First depth truncates the file and writes the header;
            # subsequent depths append without it.
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
def prox(year, output_path, attr, i_attr, table, column): attr_depths = depths[attr] i_attr_depths = depths[i_attr] years = get_years(year) for year in years: print "year:", year for i, depth in enumerate(attr_depths): print attr, "depth:", depth query = """ SELECT {0}_id, {1}_id, {2} FROM {3} WHERE year=%s """.format(attr, i_attr, column, table) if len(attr_depths) > 1: query += " and {}_id_len={}".format(attr, depth) if len(i_attr_depths) > 1: query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1]) if "secex" in table: query += " and month=0" data = sql.read_sql(query, db, params=[year]) data = data.pivot(index="{}_id".format(i_attr), columns="{}_id".format(attr), values=column) rcas = ps_calcs.rca(data) rcas[rcas >= 1] = 1 rcas[rcas < 1] = 0 prox = ps_calcs.proximity(rcas) prox = pd.DataFrame(prox.unstack(), columns=["{}_proximity".format(i_attr)]) prox["year"] = year prox = prox.set_index("year", append=True) output_path_w_year = os.path.abspath(os.path.join(output_path, str(year))) if not os.path.exists(output_path_w_year): os.makedirs(output_path_w_year) fp = os.path.join(output_path_w_year, "{}_{}_proximity.tsv".format(attr, i_attr)) file_mode = 'a' if i else 'w' user_header = False if i else True with open(fp, file_mode) as f: prox.to_csv(f, header=user_header, sep="\t")
def get_wld_proximity(year, ypw_file_path):
    """Return the HS-product proximity matrix computed from the world
    (ypw) export file at `ypw_file_path` (bz2-compressed TSV)."""
    # Load world export values, keeping HS codes as strings.
    df = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                     converters={"hs_id": str})
    df = df.rename(columns={"val_usd": "export_val"})
    wide = df.pivot(index="wld_id", columns="hs_id",
                    values="export_val").fillna(0)
    # Binarized RCA matrix feeds the proximity calculation.
    mcp = ps_calcs.rca(wide)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    return ps_calcs.proximity(mcp)
def get_wld_proximity(year, ypw_file_path):
    """Compute product proximities from the world ypw file.

    Reads the bz2 TSV at `ypw_file_path`, pivots it into a
    country x product export matrix, binarizes the RCAs and returns the
    resulting proximity matrix.
    """
    ypw = pd.read_csv(ypw_file_path,
                      compression="bz2",
                      sep="\t",
                      converters={"hs_id": str})
    ypw = ypw.rename(columns={"val_usd": "export_val"})
    ypw = ypw.pivot(index="wld_id", columns="hs_id", values="export_val")
    ypw = ypw.fillna(0)
    rcas = ps_calcs.rca(ypw)
    # Convert to a 0/1 Mcp matrix.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    prox = ps_calcs.proximity(rcas)
    return prox
def get_wld_proximity(year, ypw_file_path):
    """Return the HS-product proximity matrix built from the world (ypw)
    export file at `ypw_file_path` (bz2-compressed TSV).

    FIX: a previous version opened a MySQL connection here that was never
    used (all data comes from the file) and never closed; the dead,
    leaking connection has been removed.
    """
    '''Get world values from ypw file'''
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    # Binarize: 1 where the country has comparative advantage, else 0.
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def get_wld_proximity(year, ypw_file_path):
    """Build the HS proximity matrix from the world (ypw) export file."""
    ''' Connect to DB '''
    # NOTE(review): `db` is never used below (the data is read from the
    # file) and the connection is never closed -- looks like leftover from
    # a DB-backed variant; confirm before removing.
    db = MySQLdb.connect(host=os.environ.get("DATAVIVA_DB_HOST", "localhost"),
                         user=os.environ.get("DATAVIVA_DB_USER", "root"),
                         passwd=os.environ.get("DATAVIVA_DB_PW", ""),
                         db=os.environ.get("DATAVIVA_DB_NAME", "dataviva"))
    '''Get world values from ypw file'''
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    # Binarize RCAs before computing proximity.
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def get_wld_proximity(year):
    """Return the HS6 product proximity matrix for `year`, computed from
    the comtrade_ypw table in the DataViva database.

    Fixes over the previous version:
    - the SQL is parameterized instead of interpolated with str.format;
    - a no-op rename ({"val_usd": "val_usd"}) was removed;
    - the DB connection is now closed when the query is done.
    """
    ''' Connect to DB '''
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])
    try:
        '''Get values from database'''
        q = "select wld_id, hs_id, val_usd " \
            "from comtrade_ypw " \
            "where year = %s and length(hs_id) = 6"
        table = sql.read_sql(q, db, params=[year])
    finally:
        db.close()
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    # Binarize RCAs before computing proximity.
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def get_wld_proximity(year):
    """Compute world HS6 product proximities for `year` from the
    comtrade_ypw database table.

    Improvements: parameterized query (no string interpolation of `year`),
    removal of a pointless self-rename of the val_usd column, and the
    connection is closed after use.
    """
    conn = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                           user=os.environ["DATAVIVA_DB_USER"],
                           passwd=os.environ["DATAVIVA_DB_PW"],
                           db=os.environ["DATAVIVA_DB_NAME"])
    try:
        query = ("select wld_id, hs_id, val_usd "
                 "from comtrade_ypw "
                 "where year = %s and length(hs_id) = 6")
        ypw = sql.read_sql(query, conn, params=[year])
    finally:
        conn.close()
    ypw = ypw.pivot(index="wld_id", columns="hs_id", values="val_usd").fillna(0)
    # Binarized RCA matrix -> proximity.
    mcp = ps_calcs.rca(ypw)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    return ps_calcs.proximity(mcp)
def rdo(ybp, yp, year, depths):
    """Compute RCA / distance / opportunity-gain columns (domestic and
    world variants) for every bra/hs pair and merge them into `ybp`.

    ybp    : DataFrame indexed by (year, bra_id, hs_id)
    yp     : DataFrame indexed by hs_id, with a val_usd column
    year   : the year being processed
    depths : dict of id depths, e.g. depths["bra"], depths["hs"]

    Returns `ybp` outer-merged with the new columns.
    """
    # Restrict to HS codes at the deepest configured level.
    hs = yp[["val_usd"]].groupby(level=["hs_id"]).sum().dropna()
    hs = [h for h in hs.index if len(h) == depths["hs"][-1]]
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo_level", geo_level
        ''' RCAS '''
        rcas_dom = get_domestic_rcas(geo_level, year, ybp, depths)
        rcas_dom = rcas_dom.reindex(columns=hs)
        rcas_wld = get_wld_rcas(geo_level, year, ybp, depths)
        rcas_wld = rcas_wld.reindex(columns=hs)
        # Binarized copies used for distance/opportunity-gain; the raw
        # RCA values are kept for the output columns.
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0
        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0
        ''' DISTANCES '''
        '''domestic distances'''
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)
        '''world distances'''
        prox_wld = get_wld_proximity(year)
        # Intersection was chosen over union here (union kept commented
        # out below as the considered alternative).
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))
        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)
        ''' OPP GAIN '''
        '''same PCIs for all since we are using world PCIs'''
        pcis = get_pcis(geo_level, yp, depths)
        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        # NOTE(review): pcis_dom is computed but never used (both
        # opportunity-gain calls use pcis_wld) -- presumably intentional
        # per the comment above; confirm.
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)
        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_wld)
        ''' SET RCAS TO NULL '''
        # Zeros become NaN so they do not show up as real RCA values.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)

        def tryto(df, col, ind):
            # Safe lookup: None when either label is absent.
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for h in hs:
                rca_dist_opp.append([year, bra, h,
                                     tryto(rcas_dom, h, bra),
                                     tryto(rcas_wld, h, bra),
                                     tryto(dist_dom, h, bra),
                                     tryto(dist_wld, h, bra),
                                     tryto(opp_gain_dom, h, bra),
                                     tryto(opp_gain_wld, h, bra)])
    # now time to merge!
    ybp_rdo = pd.DataFrame(rca_dist_opp, columns=[
        "year", "bra_id", "hs_id", "rca", "rca_wld", "distance",
        "distance_wld", "opp_gain", "opp_gain_wld"
    ])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo = ybp_rdo.set_index(["year", "bra_id", "hs_id"])
    ybp = pd.merge(ybp, ybp_rdo, how="outer", left_index=True,
                   right_index=True)
    return ybp
def rdo(ybi, yi, year, depths): rca_dist_opp = [] for geo_level in depths["bra"]: print "geo level:", geo_level ybi_data = ybi.reset_index() bra_criterion = ybi_data["bra_id"].str.len() == geo_level cnae_criterion = ybi_data["cnae_id"].str.len() == 6 ybi_data = ybi_data[bra_criterion & cnae_criterion] # ybi_data = ybi_data.reindex(index=ybi_index) # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1) ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]] # ybi_data = ybi_data.unstack() # levels = ybi_data.columns.levels # labels = ybi_data.columns.labels # ybi_data.columns = levels[1][labels[1]] ''' RCAS ''' # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0) ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage") rcas = ps_calcs.rca(ybi_data) rcas_binary = rcas.copy() rcas_binary[rcas_binary >= 1] = 1 rcas_binary[rcas_binary < 1] = 0 ''' DISTANCES ''' '''calculate proximity for opportunity gain calculation''' prox = ps_calcs.proximity(rcas_binary) '''calculate distances using proximity''' dist = ps_calcs.distance(rcas_binary, prox).fillna(0) ''' OPP GAIN ''' '''calculate product complexity''' pci = ps_calcs.complexity(rcas_binary)[1] '''calculate opportunity gain''' opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci) rdo = [] for bra in rcas.index: for cnae in rcas.columns: rdo.append([year, bra, cnae, rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]]) rca_dist_opp += rdo # now time to merge! print "merging datasets..." 
ybi_rdo = pd.DataFrame(rca_dist_opp, columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"]) ybi_rdo["year"] = ybi_rdo["year"].astype(int) ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"]) # get union of both sets of indexes all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index)) all_ybi_indexes = pd.MultiIndex.from_tuples(all_ybi_indexes, names=["year", "bra_id", "cnae_id"]) # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0) ybi = ybi.reindex(index=all_ybi_indexes) ybi["rca"] = ybi_rdo["rca"] ybi["distance"] = ybi_rdo["distance"] ybi["opp_gain"] = ybi_rdo["opp_gain"] return ybi
def rdo(ymbp, ymp, year, geo_depths, ypw_file_path):
    """Compute export/import RCA, distance and opportunity-gain columns
    (domestic and world variants) per bra/hs pair and merge into `ymbp`.

    ymbp          : DataFrame indexed by (year, month, bra_id, hs_id)
    ymp           : DataFrame indexed by hs_id with export/import values
    year          : the year being processed
    geo_depths    : iterable of bra_id lengths to process
    ypw_file_path : path to the world (ypw) file used for world proximities

    Returns `ymbp` outer-merged with the new columns (month fixed at "00",
    i.e. yearly totals).
    """
    # Full-depth (6-digit) HS codes with any export / import value.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]
    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]
    rca_dist_opp = []
    for geo_level in geo_depths:
        print "geo_level",geo_level
        ''' RCAS '''
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)
        # rcd = "revealed comparative disadvantage" (import-side RCA).
        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)
        rcas_wld = get_wld_rcas(geo_level, year, ymbp, ypw_file_path)
        rcas_wld = rcas_wld.reindex(columns=export_hs)
        # Binarized copies feed distance/opp-gain; raw values are exported.
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0
        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0
        ''' DISTANCES '''
        '''domestic distances'''
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)
        '''world distances'''
        prox_wld = get_wld_proximity(year, ypw_file_path)
        # Intersection chosen over union (alternative kept commented).
        hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns))
        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)
        ''' OPP GAIN '''
        '''same PCIs for all since we are using world PCIs'''
        pcis = get_pcis(geo_level, ymp)
        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        # NOTE(review): pcis_dom is never used (both opp-gain calls use
        # pcis_wld), matching the comment above; confirm before removing.
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)
        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_wld)
        ''' SET RCAS TO NULL '''
        # Zeros become NaN so they do not read as real RCA values.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        def tryto(df, col, ind):
            # Safe lookup: None when either label is absent.
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in set(export_hs).union(set(import_hs)):
                rca_dist_opp.append([year, bra, hs,
                                     tryto(rcas_dom, hs, bra),
                                     tryto(rcas_wld, hs, bra),
                                     tryto(rcd, hs, bra),
                                     tryto(dist_dom, hs, bra),
                                     tryto(dist_wld, hs, bra),
                                     tryto(opp_gain_dom, hs, bra),
                                     tryto(opp_gain_wld, hs, bra)])
    # now time to merge!
    ybp_rdo = pd.DataFrame(rca_dist_opp,
                           columns=["year", "bra_id", "hs_id", "rca",
                                    "rca_wld", "rcd", "distance",
                                    "distance_wld", "opp_gain",
                                    "opp_gain_wld"])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    # month "00" marks yearly totals in the ymbp index.
    ybp_rdo["month"] = "00"
    ybp_rdo = ybp_rdo.set_index(["year", "month", "bra_id", "hs_id"])
    ymbp = pd.merge(ymbp, ybp_rdo, how="outer", left_index=True,
                    right_index=True)
    return ymbp
def _write_bz2_tsv(path, df, index):
    """Write `df` as a TSV into a new bz2 file at `path`, always closing
    the handle (the previous version leaked the BZ2File objects)."""
    f = bz2.BZ2File(os.path.abspath(path), 'wb')
    try:
        df.to_csv(f, sep="\t", index=index, float_format="%.3f")
    finally:
        f.close()


def main(input_file, year, output_dir):
    """Compute RCA / distance / opportunity-gain / complexity for the
    comtrade ypw data of `year` and write the results as bz2 TSV files
    under `output_dir/<year>/`.

    Fixes over the previous version: the HDFStore and the three BZ2File
    handles are now closed (they were left open).
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # The grouped ypw table is cached in an HDF5 store between runs.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            ''' Import file to pandas dataframe '''
            comtrade_df = import_file(input_file)
            ''' Add indexes '''
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()  # FIX: store was never closed
    ''' Calculate RCA '''
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id",
                            values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    ''' DISTANCES '''
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    ''' COMPLEXITY '''
    eci, pci = calc_complexity(ypw)
    ''' OPP GAIN '''
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index), pci)
    ''' MERGE DATA '''
    # Stack each matrix into a single column; zeros become NaN so they do
    # not show up as real values in the merged output.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(),
                                columns=["opp_gain"]).replace(0, np.nan)
    ypw_dist = pd.DataFrame(ypw_dist.T.stack(),
                            columns=["distance"]).replace(0, np.nan)
    ypw_rca = pd.DataFrame(ypw_rca.T.stack(),
                           columns=["rca"]).replace(0, np.nan)
    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the freshly-appended "year" column to the front.
    cols = new_ypw.columns.tolist()
    new_ypw = new_ypw[cols[-1:] + cols[:-1]]
    ''' Write out to files '''
    _write_bz2_tsv(os.path.join(output_dir, "comtrade_ypw.tsv.bz2"),
                   new_ypw, index=False)
    _write_bz2_tsv(os.path.join(output_dir, "comtrade_pci.tsv.bz2"),
                   pd.DataFrame(pci, columns=["pci"]), index=True)
    _write_bz2_tsv(os.path.join(output_dir, "comtrade_eci.tsv.bz2"),
                   pd.DataFrame(eci, columns=["eci"]), index=True)
def main():
    """OEC example: load year/origin/HS92 export data, compute RCAs,
    proximities and densities, and print a few sanity checks.

    FIX: the deprecated (and removed in modern pandas) `.ix` accessor is
    replaced with `.loc`, matching the `.loc` usage later in this
    function.
    """
    '''
    Step 1: Import the data file to a pandas DataFrame.
    '''
    try:
        oec_df = pd.read_csv("data/year_origin_hs92_4.tsv", \
                             sep="\t", \
                             converters={"hs92": str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")
    '''
    Step 2: Convert our vertically oriented data CPY
    (country-product-year) into the multidimensional Mcp matrix.
    rows = countries
    columns = products
    '''
    # Only use most recent year (could loop through each year too...)
    most_recent_year = sorted(oec_df.year.unique())[-1]
    oec_df = oec_df[oec_df.year == most_recent_year]
    # We only care about the country, product and export_val columns
    # so let's drop all the others
    oec_df = oec_df[["origin", "hs92", "export_val"]]
    # Drop all rows without export value
    oec_df = oec_df[~oec_df.export_val.isnull()]
    # Now we pivot our flat file to be countries X products
    mcp = oec_df.pivot(index="origin", columns="hs92", values="export_val")
    '''
    Step 3: Now this is the easiest part, we use the ps_calcs library to
    run the RCA calculation on the Mcp matrix.
    '''
    rcas = rca(mcp)
    # Here are some tests...
    # 1. Print the 10 products New Zealand (nzl) has the highest RCA in.
    # 0204 = Sheep and Goat Meat
    print(rcas.loc['nzl'].sort_values(ascending=False).head(10))
    # 2. Print the 10 countries with the highest RCA in cars (8703).
    # SVK = Slovakia
    print(rcas['8703'].sort_values(ascending=False).head(10))
    '''
    Step 4: Lastly, we can convert our nominal RCA values into binary 1s
    and 0s, 1 and > meaning that countries exports their fair share of
    the product and 0 meaning they don't.
    '''
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    proximities = proximity(rcas)
    densities = density(rcas, proximities)
    print("\nThe top 10 HS product codes that Brazil has RCA in:\n")
    print(rcas.loc["bra"].sort_values(ascending=False).head(10))
    print("\n Rcas")
    print(rcas)
    print("\n Proximities")
    print(proximities)
    print("\n Densities")
    print(densities)
    print("\nCalculation run successfully! "
          "Read the source code to see what's going on.")
def rdo(ybi, yi, year, depths): rca_dist_opp = [] for geo_level in depths["bra"]: print "geo level:", geo_level ybi_data = ybi.reset_index() bra_criterion = ybi_data["bra_id"].str.len() == geo_level cnae_criterion = ybi_data["cnae_id"].str.len() == 6 ybi_data = ybi_data[bra_criterion & cnae_criterion] # ybi_data = ybi_data.reindex(index=ybi_index) # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1) ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]] # ybi_data = ybi_data.unstack() # levels = ybi_data.columns.levels # labels = ybi_data.columns.labels # ybi_data.columns = levels[1][labels[1]] ''' RCAS ''' # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0) ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage") rcas = ps_calcs.rca(ybi_data) rcas_binary = rcas.copy() rcas_binary[rcas_binary >= 1] = 1 rcas_binary[rcas_binary < 1] = 0 ''' DISTANCES ''' '''calculate proximity for opportunity gain calculation''' prox = ps_calcs.proximity(rcas_binary) '''calculate distances using proximity''' dist = ps_calcs.distance(rcas_binary, prox).fillna(0) ''' OPP GAIN ''' '''calculate product complexity''' pci = ps_calcs.complexity(rcas_binary)[1] '''calculate opportunity gain''' opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci) rdo = [] for bra in rcas.index: for cnae in rcas.columns: rdo.append([ year, bra, cnae, rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra] ]) rca_dist_opp += rdo # now time to merge! print "merging datasets..." 
ybi_rdo = pd.DataFrame( rca_dist_opp, columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"]) ybi_rdo["year"] = ybi_rdo["year"].astype(int) ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"]) # get union of both sets of indexes all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index)) all_ybi_indexes = pd.MultiIndex.from_tuples( all_ybi_indexes, names=["year", "bra_id", "cnae_id"]) # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0) ybi = ybi.reindex(index=all_ybi_indexes) ybi["rca"] = ybi_rdo["rca"] ybi["distance"] = ybi_rdo["distance"] ybi["opp_gain"] = ybi_rdo["opp_gain"] return ybi
def rdo(ymbp, ymp, year, geo_depths): export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna() export_hs = [hs for hs in export_hs.index if len(hs) == 6] import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna() import_hs = [hs for hs in import_hs.index if len(hs) == 6] rca_dist_opp = [] for geo_level in geo_depths: print "geo_level", geo_level """ RCAS """ rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export") rcas_dom = rcas_dom.reindex(columns=export_hs) rcd = get_domestic_rcas(geo_level, year, ymbp, "import") rcd = rcd.reindex(columns=import_hs) # print rcd.ix["mg"] # sys.exit() rcas_wld = get_wld_rcas(geo_level, year, ymbp) rcas_wld = rcas_wld.reindex(columns=export_hs) # print rcas_wld.ix["4"] # print rcas_wld['010204'] # sys.exit() rcas_dom_binary = rcas_dom.copy() rcas_dom_binary[rcas_dom_binary >= 1] = 1 rcas_dom_binary[rcas_dom_binary < 1] = 0 rcas_wld_binary = rcas_wld.copy() rcas_wld_binary[rcas_wld_binary >= 1] = 1 rcas_wld_binary[rcas_wld_binary < 1] = 0 """ DISTANCES """ """domestic distances""" prox_dom = ps_calcs.proximity(rcas_dom_binary) dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0) """world distances""" prox_wld = get_wld_proximity(year) hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns)) # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns)) prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld) rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld) dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0) """ OPP GAIN """ """same PCIs for all since we are using world PCIs""" pcis = get_pcis(geo_level, ymp) # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns)) all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns)) pcis_dom = pcis.reindex(index=all_hs_dom) rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom) prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom) # print rcas_dom_binary.shape, 
prox_dom.shape, pcis.shape # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns)) all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns)) pcis_wld = pcis.reindex(index=all_hs_wld) rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld) prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld) # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld, pcis_wld) opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom, pcis_wld) """ SET RCAS TO NULL """ rcas_dom = rcas_dom.replace(0, np.nan) rcas_wld = rcas_wld.replace(0, np.nan) rcd = rcd.replace(0, np.nan) def tryto(df, col, ind): if col in df.columns: if ind in df.index: return df[col][ind] return None # print opp_gain_wld.ix["al000107"].ix["041601"] # print opp_gain_dom.ix["al000107"].ix["041601"] # print tryto(opp_gain_dom, "041601", "al000107") # print "al000107" in set(rcas_dom.index).union(set(rcas_wld.index)) # print "041601" in set(export_hs).union(set(import_hs)) # sys.exit() """ Connect to DB """ db = MySQLdb.connect( host=os.environ.get("DATAVIVA_DB_HOST", "localhost"), user=os.environ["DATAVIVA_DB_USER"], passwd=os.environ["DATAVIVA_DB_PW"], db=os.environ["DATAVIVA_DB_NAME"], ) db.autocommit(1) cursor = db.cursor() for bra in set(rcas_dom.index).union(set(rcas_wld.index)): for hs in set(export_hs).union(set(import_hs)): cursor.execute( "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;", [tryto(rcas_wld, hs, bra), tryto(opp_gain_wld, hs, bra), tryto(dist_wld, hs, bra), year, bra, hs], )
def main(input_file, year, output_dir):
    """Compute RCA / distance / opportunity-gain / complexity for the
    comtrade ypw data of `year` and write bz2 TSV outputs under
    `output_dir/<year>/`.

    Fixes over the previous version: the HDFStore and all three BZ2File
    handles are now closed instead of being leaked.
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Cache the grouped ypw table in an HDF5 store between runs.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            ''' Import file to pandas dataframe '''
            comtrade_df = import_file(input_file)
            ''' Add indexes '''
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()  # FIX: store was never closed
    ''' Calculate RCA '''
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id",
                            values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    ''' DISTANCES '''
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    ''' COMPLEXITY '''
    eci, pci = calc_complexity(ypw)
    ''' OPP GAIN '''
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index), pci)
    ''' MERGE DATA '''
    # Stack matrices into single columns; zeros -> NaN so they do not
    # appear as real values after the merge.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(),
                                columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)
    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)
    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)
    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the freshly-appended "year" column to the front.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]
    ''' Write out to files '''
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    out = bz2.BZ2File(new_file_path, 'wb')
    try:
        new_ypw.to_csv(out, sep="\t", index=False, float_format="%.3f")
    finally:
        out.close()  # FIX: file handle was never closed
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    out = bz2.BZ2File(new_file_path, 'wb')
    try:
        pd.DataFrame(pci, columns=["pci"]).to_csv(
            out, sep="\t", index=True, float_format="%.3f")
    finally:
        out.close()
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    out = bz2.BZ2File(new_file_path, 'wb')
    try:
        pd.DataFrame(eci, columns=["eci"]).to_csv(
            out, sep="\t", index=True, float_format="%.3f")
    finally:
        out.close()
def rdo(ymbp, ymp, year, geo_depths):
    """Compute export/import RCA, distance and opportunity-gain per
    bra/hs pair and write the world-side values directly into the
    secex_ymbp table (yearly rows, month=0) via parameterized UPDATEs.

    ymbp       : DataFrame indexed by (year, month, bra_id, hs_id)
    ymp        : DataFrame indexed by hs_id with export/import values
    year       : the year being processed
    geo_depths : iterable of bra_id lengths to process
    """
    # Full-depth (6-digit) HS codes with any export / import value.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]
    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]
    # NOTE(review): rca_dist_opp is never appended to in this DB-writing
    # variant -- looks like dead state from the list-building variant.
    rca_dist_opp = []
    for geo_level in geo_depths:
        print "geo_level", geo_level
        ''' RCAS '''
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)
        # rcd = import-side RCA ("revealed comparative disadvantage").
        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)
        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)
        # Binarized copies feed distance / opportunity-gain.
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0
        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0
        ''' DISTANCES '''
        '''domestic distances'''
        # NOTE(review): only the *_wld values are written to the DB below;
        # dist_dom / opp_gain_dom / rcd appear unused -- confirm.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)
        '''world distances'''
        prox_wld = get_wld_proximity(year)
        # Intersection chosen over union (alternative kept commented).
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))
        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)
        ''' OPP GAIN '''
        '''same PCIs for all since we are using world PCIs'''
        pcis = get_pcis(geo_level, ymp)
        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        # NOTE(review): pcis_dom is never used (both opp-gain calls use
        # pcis_wld), matching the comment above; confirm before removing.
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)
        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_wld)
        ''' SET RCAS TO NULL '''
        # Zeros become NaN so they are written as SQL NULLs, not 0.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        def tryto(df, col, ind):
            # Safe lookup: None (-> SQL NULL) when either label is absent.
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        ''' Connect to DB '''
        # NOTE(review): `db` and `cursor` are never closed -- confirm
        # whether cleanup should be added here.
        db = MySQLdb.connect(host=os.environ.get("DATAVIVA_DB_HOST",
                                                 "localhost"),
                             user=os.environ["DATAVIVA_DB_USER"],
                             passwd=os.environ["DATAVIVA_DB_PW"],
                             db=os.environ["DATAVIVA_DB_NAME"])
        db.autocommit(1)
        cursor = db.cursor()
        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in set(export_hs).union(set(import_hs)):
                cursor.execute(
                    "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;",
                    [
                        tryto(rcas_wld, hs, bra),
                        tryto(opp_gain_wld, hs, bra),
                        tryto(dist_wld, hs, bra), year, bra, hs
                    ])