def main(file_path, year, output_path, prev_path, prev5_path):
    """Build RAIS demographic aggregates for *year* and save them as .tsv.bz2.

    file_path   -- raw input file handed to to_df()
    year        -- year being processed (used to relabel prior-year indexes)
    output_path -- directory that receives one "<table>.tsv.bz2" per table
    prev_path   -- directory with last year's tables (1-year growth), or falsy
    prev5_path  -- directory with tables from 5 years ago (5-year growth), or falsy
    """
    start = time.time()
    step = 0
    geo_depths = [2, 4, 6, 7, 8]  # state, meso, micro, planning region, munic
    # NOTE(review): this HDF store is opened but never written or closed in
    # this function -- presumably leftover; confirm before removing.
    d = pd.HDFStore(os.path.abspath(os.path.join(output_path,'rais.h5')))
    step+=1; print; print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
    rais_df = to_df(file_path, False, calc_d_id=True)
    step+=1; print; print '''STEP {0}: \nAggregate with Demographics'''.format(step)
    tables = aggregate_demographics(rais_df, geo_depths)
    for table_name, table_data in tables.items():
        # NOTE(review): the rebinding below only updates `tables` if
        # add_column_length mutates its argument in place -- verify.
        table_data = add_column_length(table_name, table_data)
        # print table_data.head()
    if prev_path:
        step+=1; print; print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=True)
            # Relabel the prior year's "year" index level with the current
            # year so both frames align on the index for the growth calc.
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            tables[t_name] = calc_growth(t, t_prev)
            print tables[t_name].head()
            # sys.exit()
            if prev5_path:
                # Same relabeling trick against the 5-years-ago snapshot.
                step+=1; print; print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
                prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name, calc_d_id=True)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
                # t_prev = to_df(prev_file, t_name)
                tables[t_name] = calc_growth(tables[t_name], t_prev, 5)
    print; print '''FINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        # Write bz2-compressed, tab-separated, keeping the (multi-)index.
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.2f")
    print("--- %s minutes ---" % str((time.time() - start)/60))
def main(cur_path, year, output_path, prev_path, prev5_path, demographic_mode):
    """Attach 1-year (and optionally 5-year) growth to pre-aggregated tables.

    Reads each "<table>.tsv.bz2" from cur_path, computes growth against the
    same table in prev_path / prev5_path, and writes
    "<table>_with_growth.tsv.bz2" into output_path.
    demographic_mode -- when truthy, process the demographic table set and
    pass calc_d_id=True to to_df().
    """
    start = time.time()
    step = 0
    tables_reg = ["yb", "yi", "yo", "ybi", "ybo", "yio", "ybio"]
    if demographic_mode:
        tables_reg = ["ybd", "yid", "yod", "ybid", "ybod" ]
    for t_name in tables_reg:
        cur_file = os.path.join(cur_path, "{0}.tsv.bz2".format(t_name))
        print cur_file
        t = to_df(cur_file, t_name, calc_d_id=demographic_mode)
        step+=1; print; print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
        prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
        t_prev = to_df(prev_file, t_name, calc_d_id=demographic_mode)
        # Relabel the prior year's "year" index level with the current year
        # so both frames align on the index for the growth calc.
        t_prev = t_prev.reset_index(level="year")
        t_prev["year"] = int(year)
        t_prev = t_prev.set_index("year", append=True)
        t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
        t = calc_growth(t, t_prev)
        print t.head()
        if prev5_path:
            step+=1; print; print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
            prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=demographic_mode)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            print t_prev.head()
            t = calc_growth(t, t_prev, 5)
            print t.head()
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        print "Saving!"
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}_with_growth.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.2f")
    print("--- %s minutes ---" % str((time.time() - start)/60))
def calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path):
    """Load the current year's table and attach 1- and 5-year growth columns.

    Either previous-year path may be falsy, in which case that comparison is
    skipped.  Returns a (table_name, dataframe) tuple.
    """
    table_name = extract_tbl_name(current_year_file_path)
    result = to_df(current_year_file_path, table_name)
    comparisons = ((1, prev_year_file_path), (5, prev5_year_file_path))
    for lag, path in comparisons:
        if not path:
            continue
        baseline = to_df(path, table_name)
        # Relabel the baseline's "year" index level with the current year so
        # the two frames align on the index before computing growth.
        baseline = baseline.reset_index(level="year")
        baseline["year"] = int(year)
        baseline = baseline.set_index("year", append=True)
        level_order = ["year"] + list(baseline.index.names)[:-1]
        baseline = baseline.reorder_levels(level_order)
        result = do_growth(result, baseline, lag)
    return (table_name, result)
def main(file_path, year, output_path, prev_path, prev5_path):
    """Run the higher-education (HEDU) aggregation pipeline for one year.

    Caches the raw dataframe in an HDF store, aggregates it into the y/b/u/c
    table combinations, optionally attaches 1- and 5-year growth, computes an
    RCA table from ybuc, and saves everything to output_path as .tsv.bz2.
    """
    print "\nHEDU YEAR: {0}\n".format(year)
    pre_check()
    output_path = os.path.join(output_path, str(year))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    hdf_store = pd.HDFStore(os.path.abspath(os.path.join(output_path, 'hedu_data.h5')))
    print '''\nImport file to pandas dataframe'''
    # Reuse the cached dataframe when a prior run stored it.
    if "hedu_df" in hdf_store:
        hedu_df = hdf_store['hedu_df']
    else:
        hedu_df = to_df(file_path, year)
        try:
            hdf_store['hedu_df'] = hedu_df
        except OverflowError:
            # Cache write failed; drop the partial store so the next run
            # rebuilds from scratch.
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'hedu_data.h5'))
    tables_list = ["yb", "yu", "yc", "ybc", "ybu", "yuc", "ybuc"]
    index_lookup = {"y": "year", "b": "bra_id", "c": "course_hedu_id", "u": "university_id"}
    for table_name in tables_list:
        # Each letter of the table name selects one index column.
        indexes = [index_lookup[l] for l in table_name]
        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, hedu_df)
        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)
        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"student_id": "students"}, inplace=True)
        if 'u' not in table_name:
            # Without a university dimension the column is a distinct count.
            aggregated_df.rename(columns={"university_id": "num_universities"}, inplace=True)
        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled', 'graduates'])
        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled', 'graduates'], 5)
        if table_name == "ybuc":
            print '''Calculating RCAs'''
            ybc = calc_rca(aggregated_df, year)
            new_file_path = os.path.abspath(os.path.join(output_path, "ybc_rca.tsv.bz2"))
            ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
        file_name = table_name + ".tsv.bz2"
        print '''Save {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
def open_prev_df(prev_path, table_name, year, indexes):
    """Load a prior-year table and stamp its "year" index level with *year*.

    Relabeling the year lets the caller align the prior-year frame with the
    current-year frame on an identical index when computing growth.
    """
    source = os.path.join(prev_path, "{0}.tsv.bz2".format(table_name))
    df = to_df(source, year, indexes)
    # Pull "year" out of the index, overwrite it, then append it back as the
    # innermost level.
    df = df.reset_index(level="year")
    df["year"] = int(year)
    df = df.set_index("year", append=True)
    # Finally move "year" back to the outermost position.
    ordered = ["year"] + list(df.index.names)[:-1]
    return df.reorder_levels(ordered)
def open_prev_df(prev_path, table_name, year, indexes):
    """Read "<table_name>.tsv.bz2" from *prev_path* and relabel its year.

    The returned frame carries the current *year* in its "year" index level
    (outermost), so growth calculations can join it against this year's data.
    """
    file_name = "{0}.tsv.bz2".format(table_name)
    previous = to_df(os.path.join(prev_path, file_name), year, indexes)
    # Swap the stored year for the current one: drop the level, overwrite
    # the column, re-index, and restore "year" as the first level.
    previous = previous.reset_index(level="year")
    previous["year"] = int(year)
    previous = previous.set_index("year", append=True)
    new_order = ["year"] + list(previous.index.names)[:-1]
    previous = previous.reorder_levels(new_order)
    return previous
def main(file_path, table):
    """Load a CSV into a dataframe and bulk-insert it into a MySQL table.

    file_path -- path to the input CSV (also created as a folder when absent,
                 preserving the original behavior)
    table     -- destination table name; replaced if it already exists
    Connection parameters come from the DB_* environment variables.
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # BUG FIX: read the path passed to main(), not sys.argv[1] -- the
    # parameter was previously ignored, breaking any non-CLI caller.
    data_frame = to_df(file_path)
    connection = MySQLdb.connect(host=os.environ["DB_HOST"],
                                 user=os.environ["DB_USER"],
                                 passwd=os.environ["DB_PW"],
                                 db=os.environ["DB_NAME"])
    try:
        # Chunked insert; replace the table wholesale if it exists.
        data_frame.to_sql(table, connection, flavor='mysql',
                          if_exists='replace', index=False, chunksize=650000)
    finally:
        # BUG FIX: always release the DB connection, even on failure.
        connection.close()
def main(file_path, year, output_path):
    """Aggregate the SC (scholar census) raw file into per-dimension tables.

    Caches the imported dataframe as HDF under output_path/<year>, then for
    every table-name/demographic combination writes a .tsv.bz2 file.
    """
    pre_check()
    output_path = os.path.join(output_path, str(year))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    print "Output Path=", output_path
    # d = pd.HDFStore(os.path.abspath(os.path.join(output_path,'sc_data.h5')))
    print; print '''STEP 1: \nImport file to pandas dataframe'''
    hdf_filepath = output_path + "/store_df.h5"
    print "LOOKING for HDF file at location ", hdf_filepath
    # Reuse the cached dataframe from a previous run when available.
    if os.path.exists(hdf_filepath):
        print "READING HDF"
        df = pd.read_hdf(hdf_filepath, 'table')
    else:
        print "No HDF file. Need to create DF"
        df = to_df(file_path, False)
        print "SAVING HDF to", hdf_filepath
        df.to_hdf(hdf_filepath, 'table')
    print; print "Step 2: aggregate"
    # NOTE(review): agg_rules appears unused here; the import may exist only
    # for its side effects -- confirm before removing.
    from _aggregate import agg_rules
    pk_lookup = {"y": "year", "d": "d_id", "b": "bra_id", "c": "course_sc_id", "s": "school_id"}
    tables_list = ["yb", "ybd", "yd", "ybs", "yc", "ybc", "ybcd"]
    for table_name in tables_list:
        # Each letter of the table name selects one primary-key column.
        pk = [pk_lookup[l] for l in table_name]
        print "working on", table_name
        # Demographic tables are produced once per demographic column.
        dems = ['gender', 'color', 'loc', 'school_type'] if "d" in table_name else ['']
        for dem in dems:
            print '''\nSTEP 2: Aggregate {0}'''.format(dem)
            tbl = aggregate(table_name, pk, df, dem)
            if "c" in table_name:
                # Also aggregate at the 2-digit course level and append it.
                pk2 = [x for x in pk]
                pk2[pk2.index("course_sc_id")] = df.course_sc_id.str.slice(0, 2)
                tbl_course2 = aggregate(table_name, pk2, df, dem, course_flag=True)
                tbl = pd.concat([tbl, tbl_course2])
            tbl = add_column_length(table_name, tbl)
            # tbl.rename(columns={"student_id": "students"}, inplace=True)
            file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2"
            print '''Save {0} to output path'''.format(file_name)
            new_file_path = os.path.abspath(os.path.join(output_path, file_name))
            tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
def calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path):
    """Attach 1- and 5-year growth columns to the current year's table.

    Skips any comparison whose file path is falsy.  Returns a
    (table_name, dataframe) tuple.
    """
    name = extract_tbl_name(current_year_file_path)
    table = to_df(current_year_file_path, name)
    for path, lag in [(prev_year_file_path, 1), (prev5_year_file_path, 5)]:
        if path:
            prior = to_df(path, name)
            # Pretend the prior-year rows belong to the current year so the
            # two tables share an identical index before computing growth.
            prior = prior.reset_index(level="year")
            prior["year"] = int(year)
            prior = prior.set_index("year", append=True)
            prior = prior.reorder_levels(["year"] + list(prior.index.names)[:-1])
            table = do_growth(table, prior, lag)
    return (name, table)
def main(file_path, output_path):
    """Filter the input CSV to January rows and write it as fout.csv.

    file_path   -- input CSV consumed by to_df()
    output_path -- directory (created if missing) receiving fout.csv,
                   semicolon-separated, utf-8, floats to 3 decimals
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # BUG FIX: read the path passed to main(), not sys.argv[1] -- the
    # parameter was previously ignored, breaking any non-CLI caller.
    data_frame = to_df(file_path)
    # Keep only January rows (the old comment about Municipality_ID was
    # stale; the filter is on the Month column).
    data_frame = data_frame[(data_frame.Month == 1)]
    # Write output
    new_file_path = os.path.abspath(os.path.join(output_path, "fout.csv"))
    # BUG FIX: pass the path so pandas opens and closes the file itself;
    # the previous open() handle was never closed.
    data_frame.to_csv(new_file_path, sep=";", index=False,
                      float_format="%.3f", encoding="utf-8")
def main(file_path, output_path):
    """Filter the input CSV to January rows and write it as fout.csv.

    file_path   -- input CSV consumed by to_df()
    output_path -- directory (created if missing) receiving fout.csv,
                   semicolon-separated, utf-8, floats to 3 decimals
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    # BUG FIX: read the path passed to main(), not sys.argv[1] -- the
    # parameter was previously ignored, breaking any non-CLI caller.
    data_frame = to_df(file_path)
    # Keep only January rows (the old comment about Municipality_ID was
    # stale; the filter is on the Month column).
    data_frame = data_frame[(data_frame.Month == 1)]
    # Write output
    new_file_path = os.path.abspath(os.path.join(output_path, "fout.csv"))
    # BUG FIX: pass the path so pandas opens and closes the file itself;
    # the previous open() handle was never closed.
    data_frame.to_csv(new_file_path, sep=";", index=False,
                      float_format="%.3f", encoding="utf-8")
def main(file_path, year, output_path, prev_path, prev5_path, requireds_only):
    """Run the full RAIS pipeline for one year.

    Imports the raw file (with HDF caching), aggregates it into the
    yb/yi/yo/ybi/ybo/yio/ybio tables, computes importance, required,
    diversity and RDO measures, saves every table to output_path, and then
    (when prev_path is given) recomputes each saved table with growth
    columns against the 1- and 5-year-old runs.
    """
    print; print "~~~**** YEAR: {0} ****~~~".format(year); print;
    start = time.time()
    step = 0
    # regions state, meso, micro, planning region, munic
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "cnae": [1, 3, 6],
        "cbo": [1, 4],
        "demo": [1, 4]
    }
    if file_path:
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
        # Reuse the cached raw dataframe when a prior run stored it.
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            step+=1; print; print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
            rais_df = to_df(file_path, False)
            try:
                d['rais_df'] = rais_df
                # d.close()
            except OverflowError:
                # Cache write failed; drop the partial store so the next run
                # rebuilds from scratch.
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
                # rais_df = to_df(file_path, False)
        # Reuse cached aggregates unless we only need the "required" step.
        if "yb" in d and not requireds_only:
            tables = {"yb":d["yb"], "yo":d["yo"], "yi":d["yi"], "ybi":d["ybi"], "ybo":d["ybo"], "yio":d["yio"], "ybio":d["ybio"]}
        else:
            step+=1; print; print '''STEP {0}: \nAggregate'''.format(step)
            tables = aggregate(rais_df, depths)
            step+=1; print; print 'STEP {0}: \nImportance'.format(step)
            tables["yio"] = importance(tables["ybio"], tables["ybi"], tables["yio"], tables["yo"], year, depths)
            try:
                d["yb"] = tables["yb"]; d["yo"] = tables["yo"]; d["yi"] = tables["yi"]; d["ybi"] = tables["ybi"]; d["ybo"] = tables["ybo"]; d["yio"] = tables["yio"]; d["ybio"] = tables["ybio"]
                d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
        step+=1; print; print 'STEP {0}: \nRequired'.format(step)
        [tables["ybi"], tables["ybio"]] = required(tables["ybio"], tables["ybi"], tables["yi"], year, depths, output_path)
        # print tables["ybi"].head()
        # sys.exit()
        step+=1; print; print 'STEP {0}: \nDiversity'.format(step)
        # Diversity of each dimension against each of the other two.
        tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id", "cnae_id", year, depths)
        tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id", "cbo_id", year, depths)
        tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id", "bra_id", year, depths)
        tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id", "cbo_id", year, depths)
        tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id", "bra_id", year, depths)
        tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id", "cnae_id", year, depths)
        step+=1; print; print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format(step)
        tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths)
        for table_name, table_data in tables.items():
            # NOTE(review): the rebinding below only updates `tables` if
            # add_column_length mutates in place -- verify.
            table_data = add_column_length(table_name, table_data)
        print; print '''FINAL STEP: \nSave files to output path'''
        for t_name, t in tables.items():
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
            t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
    # Growth works off the files saved above, so it can also run standalone
    # (file_path falsy) against an existing output directory.
    if prev_path:
        print; print '''Calculating growth:'''
        for current_year_file_path in findFiles(output_path, '*.tsv.bz2'):
            # Skip files produced by a previous growth pass.
            if "growth" in current_year_file_path:
                continue
            current_year_file_name = os.path.basename(current_year_file_path)
            prev_year_file_path = os.path.join(prev_path, current_year_file_name)
            prev5_year_file_path = None
            if prev5_path:
                prev5_year_file_path = os.path.join(prev5_path, current_year_file_name)
            if not os.path.exists(prev_year_file_path):
                print "Unable to find", current_year_file_name, "for previous year."
                continue
            tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path)
            print tbl_name
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}_growth.tsv.bz2".format(tbl_name)))
            tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
            # os.remove(current_year_file_path)
    print("--- %s minutes ---" % str((time.time() - start)/60))
def main(file_path, trade_flow, year, eci_file_path, pci_file_path, output_path, prev_path, prev5_path): start = time.time() step = 0 depths = { "bra": [1, 3, 5, 7, 8, 9], "hs": [2, 6], "wld": [2, 5] } step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step) secex_df = to_df(file_path, False) secex_df = secex_df.head(1000) sys.exit() step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step) ybpw = aggregate(secex_df) step += 1; print '''\nSTEP {0}: \nShard'''.format(step) [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths) if trade_flow == "export": step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step) [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw) step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step) yb = domestic_eci(yp, yb, ybp, depths) step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step) yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths) yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths) yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths) yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths) yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths) yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths) if trade_flow == "export": step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step) yp = brazil_rca(yp, year) if trade_flow == "export": step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step) ybp = rdo(ybp, yp, year, depths) if trade_flow == "import": step += 1; print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step) ybp = rcd(ybp, yp, year, depths) # print ybp.head(20) # sys.exit() tables = {"yb": yb, "yp": yp, "yw": yw, "ybp": ybp, "ybpw": ybpw, "ybw": ybw, "ypw": ypw} if prev_path: step += 1; print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step) if prev5_path: step += 1; print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step) for t_name, t in tables.items(): prev_file = 
os.path.join(prev_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev) if prev5_path: prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev, 5) print "computing column lengths" for table_name, table_data in tables.items(): tables[table_name] = add_column_length(table_name, table_data) print '''\nFINAL STEP: \nSave files to output path''' for t_name, t in tables.items(): if not os.path.exists(output_path): os.makedirs(output_path) new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name))) t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) total_run_time = (time.time() - start) / 60 print; print; print "Total runtime: {0} minutes".format(int(total_run_time)) print; print;
def main(file_path, year, output_path):
    """Aggregate the HEDU raw file into per-dimension tables plus an RCA file.

    For every table-name/demographic combination a .tsv.bz2 file is written
    under output_path/<year>; when the ybuc table is produced it is also fed
    into calc_rca() to emit ybc_rca.tsv.bz2.
    """
    pre_check()
    output_path = os.path.join(output_path, str(year))
    print "\nYEAR: {0}\n".format(year)
    this_output_path = os.path.join(output_path)
    if not os.path.exists(this_output_path):
        os.makedirs(this_output_path)
    step = 0
    step+=1; print '''STEP {0}: Import file to pandas dataframe'''.format(step)
    df = to_df(file_path, year)
    tables_list = ["yb", "ybd", "yd", "ybc", "yc", "ybu", "ybcd", "yu", "yuc", "yucd", "yud"]
    pk_lookup = {"y": "year", "d": "d_id", "b": "bra_id", "c": "course_hedu_id", "u": "university_id"}
    # Kept so the RCA step below can reuse the aggregate computed in the loop.
    ybuc = None
    for table_name in tables_list:
        # Each letter of the table name selects one primary-key column.
        pk = [pk_lookup[l] for l in table_name]
        print "working on", table_name
        # Demographic tables are produced once per demographic column.
        dems = ['gender', 'ethnicity', 'school_type'] if "d" in table_name else ['']
        for dem in dems:
            print '''\nSTEP 2: Aggregate {0}'''.format(dem)
            tbl = aggregate(pk, df, dem)
            if "c" in table_name:
                # Also aggregate at the 2-digit course level and append it.
                pk2 = [x for x in pk]
                pk2[pk2.index("course_hedu_id")] = df.course_hedu_id.str.slice(0, 2)
                # df2.course_hedu_id = df.course_hedu_id.str.slice(0, 2)
                tbl_course2 = aggregate(pk2, df, dem)
                tbl = pd.concat([tbl, tbl_course2])
            tbl = add_column_length(table_name, tbl)
            tbl.rename(columns={"student_id": "students"}, inplace=True)
            if table_name == "yb":
                tbl.rename(columns={"university_id": "num_universities"}, inplace=True)
            if table_name == "ybuc":
                print tbl.head()
                ybuc = tbl
            file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2"
            print '''Save {0} to output path'''.format(file_name)
            new_file_path = os.path.abspath(os.path.join(output_path, file_name))
            tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
            # if "c" in table_name:
            #     print '''\nSTEP 3: Aggregate {0}'''
            #     tbl = aggregate(pk, df, '', 2)
            #     tbl = add_column_length(table_name, tbl)
            #     # print tbl.reset_index().course_hedu_id.nunique()
            #     file_name = table_name + "_cid2.tsv.bz2"
            #     print '''Save {0} to output path'''.format(file_name)
            #     new_file_path = os.path.abspath(os.path.join(output_path, file_name))
            #     tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
    if ybuc is not None:
        step+=1; print '''STEP {0}: Calculating RCAs'''.format(step)
        ybc = calc_rca(ybuc, year)
        new_file_path = os.path.abspath(os.path.join(output_path, "ybc_rca.tsv.bz2"))
        ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
        print "writing", new_file_path
def main(year, output_path, attr_type):
    """Build RAIS wage histograms per attribute id over one or more years.

    year      -- "YYYY" or an inclusive range "YYYY-YYYY"
    attr_type -- attribute family to histogram (e.g. "bra"); its id column is
                 "<attr_type>_id" and its depths come from depths_lookup
    For the latest year the bin parameters (min, max, size) are derived from
    the wage distribution and cached in an HDF store; earlier years reuse
    those cached bins so histograms stay comparable.  Sends an alert e-mail
    via SendGrid when done.
    """
    if "-" in year:
        years = range(int(year.split('-')[0]), int(year.split('-')[1]) + 1)
    else:
        years = [int(year)]
    print("years:", str(years))
    start = time.time()
    for year in years:
        d = pd.HDFStore(os.path.join(output_path, str(year), 'rais_df_raw.h5'))
        # Reuse the cached raw dataframe when a prior run stored it.
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            file_path = os.path.join(output_path, 'Rais_{}.csv'.format(year))
            rais_df = to_df(file_path)
            d['rais_df'] = rais_df
        hist_bins = pd.HDFStore(
            os.path.join(output_path, '{}_hist_bins.h5'.format(attr_type)))
        for depth in depths_lookup[attr_type]:
            print("\n{} depth: {}\n".format(attr_type, depth))
            this_depth_df = rais_df.copy()
            # Truncate the attribute id to the current hierarchy depth.
            this_depth_df['{}_id'.format(attr_type)] = this_depth_df[
                '{}_id'.format(attr_type)].str.slice(0, depth)
            # uniqs = ['1112', '8401', '8202', '7842', '7621']:
            uniqs = this_depth_df["{}_id".format(attr_type)].unique()
            # NOTE(review): the loop variable shadows the builtin ``id``.
            for i, id in enumerate(uniqs):
                this_id_df = this_depth_df[this_depth_df['{}_id'.format(
                    attr_type)] == id]
                # Too few observations for a meaningful histogram.
                if len(this_id_df.index) < 10:
                    print("\nNot enough occurences for histogram")
                    continue
                print("********* {}: {} ({}/{}) *********".format(
                    year, id, i + 1, len(uniqs)), end='\r')
                sys.stdout.flush()
                if int(year) == latest_year:
                    # Derive bins from mean +/- 2 std devs, floored at 0.
                    wage = this_id_df["wage"]
                    wmin = rounddown(wage.mean() - (wage.std() * 2))
                    wmin = 0 if wmin < 0 else wmin
                    wmax = rounddown(wage.mean() + (wage.std() * 2))
                    wrange = wmax - wmin
                    # print wrange
                    # Coarser bins for wider wage ranges.
                    bin_size = 100
                    if wrange > 3000:
                        bin_size = 200
                    if wrange > 5000:
                        bin_size = 500
                    if wrange > 10000:
                        bin_size = 1000
                    ''' !!! exception for regions (all need to have same bins!) !!! '''
                    if attr_type == "bra" and depth == 1:
                        bin_size = 200
                        wmin = 0
                        wmax = 5200
                    # Cache the bins so earlier years reuse them.
                    hist_bins["{}_{}".format(attr_type, id)] = pd.Series(
                        [wmin, wmax, bin_size])
                else:
                    # Earlier years: reuse cached bins or skip this id.
                    if "{}_{}".format(attr_type, id) in hist_bins:
                        wmin, wmax, bin_size = hist_bins["{}_{}".format(
                            attr_type, id)]
                    else:
                        continue
                hist(id, this_id_df, wmin, wmax, bin_size, attr_type, year)
        d.close()
        hist_bins.close()
    time_elapsed = "%s minutes" % str((time.time() - start) / 60)
    print('''\nTotal time %s''' % time_elapsed)
    print('''\nSending alert e-mail''')
    client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY'])
    message = sendgrid.Mail()
    message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**'))
    message.set_from("*****@*****.**")
    message.set_subject("Rais histogram for %s ready!" % year)
    message.set_html(
        "Your calculation took %s, please check out the output at the calc-server" % time_elapsed)
    client.send(message)
def main(file_path, year, output_path, prev_path, prev5_path):
    """Build RAIS demographic aggregates for *year* and save them as .tsv.bz2.

    file_path   -- raw input file handed to to_df()
    year        -- year being processed (used to relabel prior-year indexes)
    output_path -- directory that receives one "<table>.tsv.bz2" per table
    prev_path   -- directory with last year's tables (1-year growth), or falsy
    prev5_path  -- directory with tables from 5 years ago (5-year growth), or falsy
    """
    start = time.time()
    step = 0
    geo_depths = [2, 4, 6, 7, 8]  # state, meso, micro, planning region, munic
    # NOTE(review): this HDF store is opened but never written or closed in
    # this function -- presumably leftover; confirm before removing.
    d = pd.HDFStore(os.path.abspath(os.path.join(output_path, 'rais.h5')))
    step += 1
    print
    print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
    rais_df = to_df(file_path, False, calc_d_id=True)
    step += 1
    print
    print '''STEP {0}: \nAggregate with Demographics'''.format(step)
    tables = aggregate_demographics(rais_df, geo_depths)
    for table_name, table_data in tables.items():
        # NOTE(review): the rebinding below only updates `tables` if
        # add_column_length mutates its argument in place -- verify.
        table_data = add_column_length(table_name, table_data)
        # print table_data.head()
    if prev_path:
        step += 1
        print
        print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=True)
            # Relabel the prior year's "year" index level with the current
            # year so both frames align on the index for the growth calc.
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            tables[t_name] = calc_growth(t, t_prev)
            print tables[t_name].head()
            # sys.exit()
            if prev5_path:
                # Same relabeling trick against the 5-years-ago snapshot.
                step += 1
                print
                print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
                prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name, calc_d_id=True)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
                # t_prev = to_df(prev_file, t_name)
                tables[t_name] = calc_growth(tables[t_name], t_prev, 5)
    print
    print '''FINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        # Write bz2-compressed, tab-separated, keeping the (multi-)index.
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True,
                 float_format="%.2f")
    print("--- %s minutes ---" % str((time.time() - start) / 60))
import sys from _to_df import to_df if len(sys.argv) < 2: sys.exit("Usage: print.py csv_file.csv") df = to_df(sys.argv[1]) print df.head()
def main(file_path, year, output_path, prev_path, prev5_path):
    """Run the higher-education (HEDU) aggregation pipeline for one year.

    Caches the raw dataframe in an HDF store, aggregates it into the y/b/u/c
    table combinations, optionally attaches 1- and 5-year growth, computes an
    RCA table from ybuc, and saves everything to output_path as .tsv.bz2.
    """
    print "\nHEDU YEAR: {0}\n".format(year)
    pre_check()
    output_path = os.path.join(output_path, str(year))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    hdf_store = pd.HDFStore(
        os.path.abspath(os.path.join(output_path, 'hedu_data.h5')))
    print '''\nImport file to pandas dataframe'''
    # Reuse the cached dataframe when a prior run stored it.
    if "hedu_df" in hdf_store:
        hedu_df = hdf_store['hedu_df']
    else:
        hedu_df = to_df(file_path, year)
        try:
            hdf_store['hedu_df'] = hedu_df
        except OverflowError:
            # Cache write failed; drop the partial store so the next run
            # rebuilds from scratch.
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'hedu_data.h5'))
    tables_list = ["yb", "yu", "yc", "ybc", "ybu", "yuc", "ybuc"]
    index_lookup = {
        "y": "year",
        "b": "bra_id",
        "c": "course_hedu_id",
        "u": "university_id"
    }
    for table_name in tables_list:
        # Each letter of the table name selects one index column.
        indexes = [index_lookup[l] for l in table_name]
        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, hedu_df)
        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)
        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"student_id": "students"}, inplace=True)
        if 'u' not in table_name:
            # Without a university dimension the column is a distinct count.
            aggregated_df.rename(columns={"university_id": "num_universities"},
                                 inplace=True)
        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled', 'graduates'])
        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled', 'graduates'], 5)
        if table_name == "ybuc":
            print '''Calculating RCAs'''
            ybc = calc_rca(aggregated_df, year)
            new_file_path = os.path.abspath(
                os.path.join(output_path, "ybc_rca.tsv.bz2"))
            ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
        file_name = table_name + ".tsv.bz2"
        print '''Save {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t",
                             index=True)
def main(file_path, year, output_path, prev_path, prev5_path):
    """Run the SC (scholar census) aggregation pipeline for one year.

    Caches the raw dataframe in an HDF store, aggregates it into the y/b/s/c
    table combinations, optionally attaches 1- and 5-year growth, saves each
    table as .tsv.bz2 under output_path/<year>, and sends a SendGrid alert
    e-mail when the run completes.
    """
    print "\nSC YEAR: {0}\n".format(year)
    start = time.time()
    pre_check()
    output_path = os.path.join(output_path, str(year))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    hdf_store = pd.HDFStore(
        os.path.abspath(os.path.join(output_path, 'sc_data.h5')))
    print '''\nImport file to pandas dataframe'''
    # Reuse the cached dataframe when a prior run stored it.
    if "sc_df" in hdf_store:
        sc_df = hdf_store['sc_df']
    else:
        sc_df = to_df(file_path)
        try:
            hdf_store['sc_df'] = sc_df
        except OverflowError:
            # Cache write failed; drop the partial store so the next run
            # rebuilds from scratch.
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'sc_data.h5'))
    tables_list = ["yb", "yc", "ys", "ybs", "ybc", "ysc", "ybsc"]
    index_lookup = {"y": "year", "b": "bra_id", "c": "course_sc_id", "s": "school_id"}
    for table_name in tables_list:
        # Each letter of the table name selects one index column.
        indexes = [index_lookup[l] for l in table_name]
        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, sc_df)
        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)
        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"enroll_id": "enrolled"}, inplace=True)
        aggregated_df.rename(columns={"class_id": "classes"}, inplace=True)
        if 's' not in table_name:
            # Without a school dimension the column is a distinct count.
            aggregated_df.rename(columns={"school_id": "num_schools"},
                                 inplace=True)
        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled'])
        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled'], 5)
        file_name = table_name + ".tsv.bz2"
        print '''\nSave {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t",
                             index=True)
    time_elapsed = "%s minutes" % str((time.time() - start) / 60)
    print '''\nTotal time %s''' % time_elapsed
    print '''\nSending alert e-mail'''
    client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY'])
    message = sendgrid.Mail()
    message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**'))
    message.set_from("*****@*****.**")
    message.set_subject("Scholar census %s ready!" % year)
    message.set_html("Your calculation took %s, please check out the output at the calc-server" % time_elapsed)
    client.send(message)
def main(file_path, year, output_path): pre_check() output_path = os.path.join(output_path, str(year)) print "\nYEAR: {0}\n".format(year) this_output_path = os.path.join(output_path) if not os.path.exists(this_output_path): os.makedirs(this_output_path) step = 0 step += 1 print '''STEP {0}: Import file to pandas dataframe'''.format(step) df = to_df(file_path, year) tables_list = [ "yb", "ybd", "yd", "ybc", "yc", "ybu", "ybcd", "yu", "yuc", "yucd", "yud" ] pk_lookup = { "y": "year", "d": "d_id", "b": "bra_id", "c": "course_hedu_id", "u": "university_id" } ybuc = None for table_name in tables_list: pk = [pk_lookup[l] for l in table_name] print "working on", table_name dems = ['gender', 'ethnicity', 'school_type' ] if "d" in table_name else [''] for dem in dems: print '''\nSTEP 2: Aggregate {0}'''.format(dem) tbl = aggregate(pk, df, dem) if "c" in table_name: pk2 = [x for x in pk] pk2[pk2.index("course_hedu_id")] = df.course_hedu_id.str.slice( 0, 2) # df2.course_hedu_id = df.course_hedu_id.str.slice(0, 2) tbl_course2 = aggregate(pk2, df, dem) tbl = pd.concat([tbl, tbl_course2]) tbl = add_column_length(table_name, tbl) tbl.rename(columns={"student_id": "students"}, inplace=True) if table_name == "yb": tbl.rename(columns={"university_id": "num_universities"}, inplace=True) if table_name == "ybuc": print tbl.head() ybuc = tbl file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2" print '''Save {0} to output path'''.format(file_name) new_file_path = os.path.abspath( os.path.join(output_path, file_name)) tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) # if "c" in table_name: # print '''\nSTEP 3: Aggregate {0}''' # tbl = aggregate(pk, df, '', 2) # tbl = add_column_length(table_name, tbl) # # print tbl.reset_index().course_hedu_id.nunique() # file_name = table_name + "_cid2.tsv.bz2" # print '''Save {0} to output path'''.format(file_name) # new_file_path = os.path.abspath(os.path.join(output_path, file_name)) # 
tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) if ybuc is not None: step += 1 print '''STEP {0}: Calculating RCAs'''.format(step) ybc = calc_rca(ybuc, year) new_file_path = os.path.abspath( os.path.join(output_path, "ybc_rca.tsv.bz2")) ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) print "writing", new_file_path
def main(file_path, year, output_path, prev_path, prev5_path): print "\nSC YEAR: {0}\n".format(year) start = time.time() pre_check() output_path = os.path.join(output_path, str(year)) if not os.path.exists(output_path): os.makedirs(output_path) hdf_store = pd.HDFStore( os.path.abspath(os.path.join(output_path, 'sc_data.h5'))) print '''\nImport file to pandas dataframe''' if "sc_df" in hdf_store: sc_df = hdf_store['sc_df'] else: sc_df = to_df(file_path) try: hdf_store['sc_df'] = sc_df except OverflowError: print "WARNING: Unable to save dataframe, Overflow Error." hdf_store.close() os.remove(os.path.join(output_path, 'sc_data.h5')) tables_list = ["yb", "yc", "ys", "ybs", "ybc", "ysc", "ybsc"] index_lookup = { "y": "year", "b": "bra_id", "c": "course_sc_id", "s": "school_id" } for table_name in tables_list: indexes = [index_lookup[l] for l in table_name] print '''\nAggregating {0}'''.format(table_name) aggregated_df = aggregate(indexes, sc_df) print '''Adding length column to {0}'''.format(table_name) aggregated_df = add_column_length(table_name, aggregated_df) print '''Renaming {0} columns'''.format(table_name) aggregated_df.rename(columns={"enroll_id": "enrolled"}, inplace=True) aggregated_df.rename(columns={"class_id": "classes"}, inplace=True) if 's' not in table_name: aggregated_df.rename(columns={"school_id": "num_schools"}, inplace=True) if prev_path: print '''\nCalculating {0} 1 year growth'''.format(table_name) previous_df = open_prev_df(prev_path, table_name, year, indexes) aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled']) if prev5_path: print '''\nCalculating {0} 5 year growth'''.format(table_name) previous_df = open_prev_df(prev5_path, table_name, year, indexes) aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled'], 5) file_name = table_name + ".tsv.bz2" print '''\nSave {0} to output path'''.format(file_name) new_file_path = os.path.abspath(os.path.join(output_path, file_name)) 
aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) time_elapsed = "%s minutes" % str((time.time() - start) / 60) print '''\nTotal time %s''' % time_elapsed print '''\nSending alert e-mail''' client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY']) message = sendgrid.Mail() message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**')) message.set_from("*****@*****.**") message.set_subject("Scholar census %s ready!" % year) message.set_html( "Your calculation took %s, please check out the output at the calc-server" % time_elapsed) client.send(message)
def main(export_file_path, import_file_path, year, eci_file_path, pci_file_path, ypw_file_path, output_path, prev_path, prev5_path): output_path = os.path.join(output_path, str(year)) start = time.time() step = 0 depths = {"bra": [1, 3, 5, 7, 9], "hs": [2, 6], "wld": [2, 5]} if not os.path.exists(output_path): os.makedirs(output_path) d = pd.HDFStore(os.path.join(output_path, 'secex.h5')) # if "ymb" in d: if "ymbp" in d: tables = {} tables["ymb"] = d["ymb"] tables["ymp"] = d["ymp"] tables["ymw"] = d["ymw"] tables["ymbp"] = d["ymbp"] tables["ymbw"] = d["ymbw"] tables["ympw"] = d["ympw"] tables["ymbpw"] = d["ymbpw"] else: step += 1 print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step) secex_exports = to_df(export_file_path, False) secex_imports = to_df(import_file_path, False) step += 1 print '''\nSTEP {0}: \nMerge imports and exports'''.format(step) secex_df = merge(secex_exports, secex_imports) step += 1 print '''\nSTEP {0}: \nAggregate'''.format(step) ymbpw = aggregate(secex_df) step += 1 print '''\nSTEP {0}: \nShard'''.format(step) [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw) step += 1 print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step) [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year) step += 1 print '''\nSTEP {0}: \nCalculate diversity'''.format(step) ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id") ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id") ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id") ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id") ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id") ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id") step += 1 print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step) ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"]) step += 1 print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step) ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"]) step += 1 print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step) ymp = brazil_rca(ymp, ypw_file_path, year) step 
+= 1 print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format( step) ymbp = rdo(ymbp, ymp, year, depths["bra"], ypw_file_path) tables = { "ymb": ymb, "ymp": ymp, "ymw": ymw, "ymbp": ymbp, "ymbpw": ymbpw, "ymbw": ymbw, "ympw": ympw } for tbln, tbl in tables.items(): d[tbln] = tbl if prev_path: step += 1 print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step) if prev5_path: step += 1 print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step) for t_name, t in tables.items(): print t_name prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev) if prev5_path: prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev, 5) print "computing column lengths" for table_name, table_data in tables.items(): tables[table_name] = add_column_length(table_name, table_data) print '''\nFINAL STEP: \nSave files to output path''' for t_name, t in tables.items(): if not os.path.exists(output_path): os.makedirs(output_path) new_file_path = os.path.abspath( os.path.join(output_path, "{0}.tsv.bz2".format(t_name))) t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) total_run_time = (time.time() - start) / 60 print print print "Total runtime: {0} minutes".format(int(total_run_time)) print print
def main(file_path, year, output_path, prev_path, prev5_path, requireds_only): print print "~~~**** YEAR: {0} ****~~~".format(year) print start = time.time() step = 0 # regions state, meso, micro, planning region, munic depths = { "bra": [1, 3, 5, 7, 8, 9], "cnae": [1, 3, 6], "cbo": [1, 4], "demo": [1, 4] } if file_path: if not os.path.exists(output_path): os.makedirs(output_path) d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5')) if "rais_df" in d: rais_df = d['rais_df'] else: step += 1 print print '''STEP {0}: \nImport file to pandas dataframe'''.format( step) rais_df = to_df(file_path, False) try: d['rais_df'] = rais_df # d.close() except OverflowError: print "WARNING: Unable to save dataframe, Overflow Error." d.close() os.remove(os.path.join(output_path, 'rais_df_raw.h5')) # rais_df = to_df(file_path, False) if "yb" in d and not requireds_only: tables = { "yb": d["yb"], "yo": d["yo"], "yi": d["yi"], "ybi": d["ybi"], "ybo": d["ybo"], "yio": d["yio"], "ybio": d["ybio"] } else: step += 1 print print '''STEP {0}: \nAggregate'''.format(step) tables = aggregate(rais_df, depths) step += 1 print print 'STEP {0}: \nImportance'.format(step) tables["yio"] = importance(tables["ybio"], tables["ybi"], tables["yio"], tables["yo"], year, depths) try: d["yb"] = tables["yb"] d["yo"] = tables["yo"] d["yi"] = tables["yi"] d["ybi"] = tables["ybi"] d["ybo"] = tables["ybo"] d["yio"] = tables["yio"] d["ybio"] = tables["ybio"] d.close() except OverflowError: print "WARNING: Unable to save dataframe, Overflow Error." 
d.close() os.remove(os.path.join(output_path, 'rais_df_raw.h5')) step += 1 print print 'STEP {0}: \nRequired'.format(step) [tables["ybi"], tables["ybio"]] = required(tables["ybio"], tables["ybi"], tables["yi"], year, depths, output_path) # print tables["ybi"].head() # sys.exit() step += 1 print print 'STEP {0}: \nDiversity'.format(step) tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id", "cnae_id", year, depths) tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id", "cbo_id", year, depths) tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id", "bra_id", year, depths) tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id", "cbo_id", year, depths) tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id", "bra_id", year, depths) tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id", "cnae_id", year, depths) step += 1 print print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format( step) tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths) for table_name, table_data in tables.items(): table_data = add_column_length(table_name, table_data) print print '''FINAL STEP: \nSave files to output path''' for t_name, t in tables.items(): new_file_path = os.path.abspath( os.path.join(output_path, "{0}.tsv.bz2".format(t_name))) t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f") if prev_path: print print '''Calculating growth:''' for current_year_file_path in findFiles(output_path, '*.tsv.bz2'): if "growth" in current_year_file_path: continue current_year_file_name = os.path.basename(current_year_file_path) prev_year_file_path = os.path.join(prev_path, current_year_file_name) prev5_year_file_path = None if prev5_path: prev5_year_file_path = os.path.join(prev5_path, current_year_file_name) if not os.path.exists(prev_year_file_path): print "Unable to find", current_year_file_name, "for previous year." 
continue tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path) print tbl_name new_file_path = os.path.abspath( os.path.join(output_path, "{0}_growth.tsv.bz2".format(tbl_name))) tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f") # os.remove(current_year_file_path) print("--- %s minutes ---" % str((time.time() - start) / 60))
def main(output_path, start_from):
    """Compute ENE (employees per establishment) per CNAE x CBO x
    establishment size and merge it into the yio table.

    Reads the cached RAIS dataframe from rais_df_raw.h5 (prompting for a
    raw file path if the cache is missing), walks every CNAE id at each
    depth, restricts establishments to the wage inter-quartile range per
    size class, appends the per-occupation ENE rows to ene.csv, then joins
    ene.csv onto yio.tsv.bz2 and writes yio_ene.tsv.bz2.

    NOTE(review): `depths` is read here but not defined in this function
    -- presumably a module-level constant (the other mains define
    depths["cnae"] = [1, 3, 6] locally); confirm.
    """
    start = time.time()
    print "Reading rais_df_raw.h5..."
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
    if "rais_df" in d:
        rais_df = d['rais_df']
    else:
        file_path = raw_input('No rais_df_raw.h5 found, raw file path: ')
        rais_df = to_df(file_path, False)
    d.close()
    # Single-year store: take the one year present, drop unused columns.
    year = int(rais_df["year"].unique().tolist()[0])
    rais_df = rais_df.drop(
        ["year", "age", "color", "gender", "literacy", "d_id"], axis=1)
    # All CNAE ids plus their 3- and 1-char prefixes, sorted so that a
    # prefix is always processed before its longer descendants.
    cnaes = set(rais_df.cnae_id.unique())
    cnaes = cnaes.union({c[:3] for c in cnaes})
    cnaes = cnaes.union({c[:1] for c in cnaes})
    cnaes = list(cnaes)
    cnaes.sort()
    if start_from:
        # Resume a previous run from the given CNAE id.
        cnaes = cnaes[cnaes.index(start_from):]
    # Cache of the most recently seen subset at each non-leaf depth, so a
    # child CNAE can filter its parent's (smaller) frame instead of the
    # full dataframe.
    last_seen = {d: {"id": None, "data": None} for d in depths["cnae"][:-1]}
    for i, cnae in enumerate(cnaes):
        s = time.time()
        # Depth of the parent prefix for this id's length.
        prev_id_len = depths["cnae"].index(len(cnae)) - 1
        prev_id_len = depths["cnae"][prev_id_len]
        if prev_id_len in last_seen and cnae[:prev_id_len] == last_seen[prev_id_len]["id"]:
            cnae_df = last_seen[prev_id_len]["data"][
                last_seen[prev_id_len]["data"].cnae_id.str.startswith(cnae)]
        else:
            cnae_df = rais_df[rais_df.cnae_id.str.startswith(cnae)]
        if len(cnae) in last_seen:
            last_seen[len(cnae)]["id"] = cnae
            last_seen[len(cnae)]["data"] = cnae_df.copy()
        # NOTE(review): cnae_df is already prefix-filtered above, so this
        # second startswith filter looks redundant.
        cnae_df = cnae_df[cnae_df.cnae_id.str.startswith(cnae)]
        if len(cnae) < 6:
            # Collapse ids to the current depth, then keep exact matches.
            cnae_df["cnae_id"] = cnae_df["cnae_id"].str.slice(0, len(cnae))
        cnae_df = cnae_df[cnae_df["cnae_id"] == cnae]
        # cnae_df['est_id'] = cnae_df['est_id'].str.cat(cnae_df['est_size'].values.astype(str), sep='_')
        # cnae_df = cnae_df.drop(["est_size"], axis=1)
        # Median wage per establishment, then the IQR bounds per size class.
        ests = cnae_df.groupby(["est_size", "est_id"]).agg(
            {"wage": pd.Series.median})
        bounds_lower = ests.groupby(level=["est_size"]).agg(
            {"wage": lambda x: x.quantile(0.25)})
        bounds_upper = ests.groupby(level=["est_size"]).agg(
            {"wage": lambda x: x.quantile(0.75)})
        ests = ests.reset_index(level=["est_id"])
        ests["lower"] = bounds_lower["wage"]
        ests["upper"] = bounds_upper["wage"]
        # print ests.shape
        # Keep only establishments whose median wage lies within the IQR
        # of their size class.
        ests = ests[(ests["wage"] >= ests["lower"])
                    & (ests["wage"] <= ests["upper"])]
        # print ests.shape
        # print ests["est_id"].count(), ests["est_id"].nunique()
        # cnae_df["est_qualify"] = cnae_df["est_id"].apply(lambda x: x in ests["est_id"].tolist())
        # cnae_df = cnae_df[cnae_df["est_qualify"]]
        cnae_df = cnae_df[cnae_df["est_id"].isin(ests["est_id"])]
        if cnae_df.empty:
            continue
        num_ests = ests.groupby(level=0).agg({"est_id": pd.Series.count})
        # cbos = cnae_df.groupby(["est_size", "cbo_id"]).agg({"est_id":lambda x: set.union(set(x)), "num_emp":pd.Series.count, "wage":pd.Series.median})
        cbos = cnae_df.groupby(["est_size", "cbo_id"]).agg({
            "est_id": pd.Series.nunique,
            "num_emp": pd.Series.count,
            "wage": pd.Series.median
        })
        cbos = cbos.reset_index(level=["cbo_id"])
        # cbos['test'] = cbos.est_id.str.len()
        cbos['num_est'] = num_ests["est_id"]
        # Share of qualifying establishments (per size class) that employ
        # this occupation; keep occupations present in at least 20%.
        cbos['qualify'] = cbos["est_id"] / cbos["num_est"]
        cbos = cbos[cbos['qualify'] >= 0.2]
        # ENE: employees divided by distinct establishments.
        cbos["ene"] = cbos["num_emp"] / cbos["est_id"]
        cbos = cbos.reset_index().set_index(["cbo_id", "est_size"])
        # print cbos["ene"].head()
        # Pivot est_size into the ene_* columns.
        ene = cbos["ene"].unstack(level=-1)
        ene = ene.rename(columns={
            0: 'ene_micro',
            1: 'ene_small',
            2: 'ene_medium',
            3: 'ene_large'
        })
        ene["cnae_id"] = cnae
        ene["year"] = year
        ene = ene.set_index(["year", "cnae_id"], append=True)
        ene = ene.reorder_levels(["year", "cnae_id", "cbo_id"])
        print cnae, (time.time() - s) / 60
        fname_yio = os.path.join(output_path, 'ene.csv')
        # First CNAE of a fresh run writes the header; all later (or
        # resumed) writes append without a header.
        if i == 0 and not start_from:
            ene.to_csv(fname_yio)
        else:
            ene.to_csv(open(fname_yio, 'a'), header=False)
    print "Done! Merging..."
    # Join the accumulated ENE rows onto the existing yio table.
    index_col = ["year", "cnae_id", "cbo_id"]
    full_tbl = pd.read_csv(os.path.join(output_path, "yio.tsv.bz2"),
                           sep="\t", compression="bz2",
                           converters={"cbo_id": str, "cnae_id": str,
                                       "year": int})
    full_tbl = full_tbl.set_index(index_col)
    ene_tbl = pd.read_csv(os.path.join(output_path, "ene.csv"),
                          converters={"cbo_id": str, "cnae_id": str,
                                      "year": int})
    ene_tbl = ene_tbl.set_index(index_col)
    full_tbl = full_tbl.join(ene_tbl, how='outer')
    new_file_path = os.path.abspath(
        os.path.join(output_path, "yio_ene.tsv.bz2"))
    full_tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True,
                    float_format="%.2f")
    print("--- %s minutes ---" % str((time.time() - start)/60))
def main(file_path, trade_flow, year, eci_file_path, pci_file_path, output_path, prev_path, prev5_path): start = time.time() step = 0 depths = {"bra": [1, 3, 5, 7, 8, 9], "hs": [2, 6], "wld": [2, 5]} step += 1 print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step) secex_df = to_df(file_path, False) secex_df = secex_df.head(1000) sys.exit() step += 1 print '''\nSTEP {0}: \nAggregate'''.format(step) ybpw = aggregate(secex_df) step += 1 print '''\nSTEP {0}: \nShard'''.format(step) [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths) if trade_flow == "export": step += 1 print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step) [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw) step += 1 print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step) yb = domestic_eci(yp, yb, ybp, depths) step += 1 print '''\nSTEP {0}: \nCalculate diversity'''.format(step) yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths) yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths) yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths) yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths) yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths) yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths) if trade_flow == "export": step += 1 print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step) yp = brazil_rca(yp, year) if trade_flow == "export": step += 1 print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format( step) ybp = rdo(ybp, yp, year, depths) if trade_flow == "import": step += 1 print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step) ybp = rcd(ybp, yp, year, depths) # print ybp.head(20) # sys.exit() tables = { "yb": yb, "yp": yp, "yw": yw, "ybp": ybp, "ybpw": ybpw, "ybw": ybw, "ypw": ypw } if prev_path: step += 1 print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step) if prev5_path: step += 1 print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step) for t_name, t in tables.items(): prev_file = 
os.path.join(prev_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev) if prev5_path: prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name)) t_prev = to_df(prev_file, t_name) t_prev = t_prev.reset_index(level="year") t_prev["year"] = int(year) t_prev = t_prev.set_index("year", append=True) t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1]) t = calc_growth(t, t_prev, 5) print "computing column lengths" for table_name, table_data in tables.items(): tables[table_name] = add_column_length(table_name, table_data) print '''\nFINAL STEP: \nSave files to output path''' for t_name, t in tables.items(): if not os.path.exists(output_path): os.makedirs(output_path) new_file_path = os.path.abspath( os.path.join(output_path, "{0}.tsv.bz2".format(t_name))) t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) total_run_time = (time.time() - start) / 60 print print print "Total runtime: {0} minutes".format(int(total_run_time)) print print
def main(export_file_path, import_file_path, year, eci_file_path, pci_file_path, ypw_file_path, output_path): start = time.time() step = 0 depths = { "bra": [1, 3, 5, 7, 9], "hs": [2, 6], "wld": [2, 5] } if not os.path.exists(output_path): os.makedirs(output_path) d = pd.HDFStore(os.path.join(output_path, 'secex.h5')) # if "ymb" in d: if "ymbp" in d: tables = {} tables["ymb"] = d["ymb"]; tables["ymp"] = d["ymp"]; tables["ymw"] = d["ymw"]; tables["ymbp"] = d["ymbp"]; tables["ymbw"] = d["ymbw"]; tables["ympw"] = d["ympw"]; tables["ymbpw"] = d["ymbpw"] else: step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step) secex_exports = to_df(export_file_path, False) secex_imports = to_df(import_file_path, False) # secex_exports = secex_exports.head(1000) # secex_imports = secex_imports.head(1000) step += 1; print '''\nSTEP {0}: \nMerge imports and exports'''.format(step) secex_df = merge(secex_exports, secex_imports) step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step) ymbpw = aggregate(secex_df) step += 1; print '''\nSTEP {0}: \nShard'''.format(step) [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw) step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step) [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year) step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step) ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id") ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id") ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id") ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id") ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id") ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id") step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step) ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"]) step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step) ymp = brazil_rca(ymp, ypw_file_path, year) step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step) ymbp 
= rdo(ymbp, ymp, year, depths["bra"], ypw_file_path) tables = {"ymb": ymb, "ymp": ymp, "ymw": ymw, "ymbp": ymbp, "ymbpw": ymbpw, "ymbw": ymbw, "ympw": ympw} for tbln, tbl in tables.items(): d[tbln] = tbl print "computing column lengths" for table_name, table_data in tables.items(): tables[table_name] = add_column_length(table_name, table_data) print '''\nFINAL STEP: \nSave files to output path''' for t_name, t in tables.items(): if not os.path.exists(output_path): os.makedirs(output_path) new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name))) t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True) total_run_time = (time.time() - start) / 60 print; print; print "Total runtime: {0} minutes".format(int(total_run_time)) print; print;