def main(file_path, year, output_path, prev_path, prev5_path):
    start = time.time()
    step = 0
    geo_depths = [2, 4, 6, 7, 8] # state, meso, micro, planning region, munic
    
    d = pd.HDFStore(os.path.abspath(os.path.join(output_path,'rais.h5')))

    step+=1; print; print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
    rais_df = to_df(file_path, False, calc_d_id=True)

    step+=1; print; print '''STEP {0}: \nAggregate with Demographics'''.format(step)
    tables = aggregate_demographics(rais_df, geo_depths)

    for table_name, table_data in tables.items():
        table_data = add_column_length(table_name, table_data)
        # print table_data.head()
        
    if prev_path:
        step+=1; print; print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=True)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            
            tables[t_name] = calc_growth(t, t_prev)

            print tables[t_name].head()
            # sys.exit()
            
            if prev5_path:
                step+=1; print; print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
                prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name, calc_d_id=True)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
                
                # t_prev = to_df(prev_file, t_name)
                tables[t_name] = calc_growth(tables[t_name], t_prev, 5)
    
    
    print; print '''FINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.2f")
    
    print("--- %s minutes ---" % str((time.time() - start)/60))
Exemplo n.º 2
0
def main(cur_path, year, output_path, prev_path, prev5_path, demographic_mode):
    """Attach 1-year (and optionally 5-year) growth columns to pre-built
    RAIS tables and save them as <name>_with_growth.tsv.bz2.

    cur_path         -- directory holding the current year's .tsv.bz2 tables
    year             -- current year (previous tables are re-stamped to it)
    output_path      -- destination directory (created on first save if absent)
    prev_path        -- directory with last year's tables
    prev5_path       -- directory with tables from 5 years back, or falsy to skip
    demographic_mode -- if true, process the demographic table set (ybd, yid, ...)
                        instead of the standard one (yb, yi, ...)
    """
    start = time.time()
    step = 0
   
    tables_reg = ["yb", "yi", "yo", "ybi", "ybo", "yio", "ybio"]
    if demographic_mode:
        tables_reg = ["ybd", "yid", "yod", "ybid", "ybod" ]

    for t_name in tables_reg:
        cur_file = os.path.join(cur_path, "{0}.tsv.bz2".format(t_name))
        print cur_file
        t = to_df(cur_file, t_name, calc_d_id=demographic_mode)

    
        step+=1; print; print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
    
        prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
        t_prev = to_df(prev_file, t_name, calc_d_id=demographic_mode)
        # Re-stamp the previous table's "year" index level with the current
        # year so the growth computation joins on identical index values.
        t_prev = t_prev.reset_index(level="year")
        t_prev["year"] = int(year)
        t_prev = t_prev.set_index("year", append=True)
        t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            
        t = calc_growth(t, t_prev)
        print t.head()

        if prev5_path:
            step+=1; print; print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
            prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=demographic_mode)
            # Same re-stamping for the 5-year lookback table.
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            
            print t_prev.head()

            t = calc_growth(t, t_prev, 5)

            print t.head()

        if not os.path.exists(output_path):
            os.makedirs(output_path)
        print "Saving!"
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}_with_growth.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.2f")
    
    print("--- %s minutes ---" % str((time.time() - start)/60))
Exemplo n.º 3
0
def calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path):
    """Attach 1- and 5-year growth columns to the table stored at
    *current_year_file_path* and return ``(table_name, table)``.

    A falsy previous-year path skips that lookback entirely.
    """
    tbl_name = extract_tbl_name(current_year_file_path)
    result = to_df(current_year_file_path, tbl_name)

    lookbacks = ((prev_year_file_path, 1), (prev5_year_file_path, 5))
    for path, offset in lookbacks:
        if not path:
            continue
        older = to_df(path, tbl_name)
        # Overwrite the historical "year" index level with the current
        # year so both tables share identical index values for the join.
        older = older.reset_index(level="year")
        older["year"] = int(year)
        older = older.set_index("year", append=True)
        level_names = list(older.index.names)
        older = older.reorder_levels(["year"] + level_names[:-1])
        result = do_growth(result, older, offset)

    return (tbl_name, result)
Exemplo n.º 4
0
def main(file_path, year, output_path, prev_path, prev5_path):
    """Build the HEDU (higher education) aggregation tables for *year*.

    Loads the raw file (cached in an HDF5 store under the output path),
    aggregates it for every table in tables_list, optionally attaches
    1- and 5-year growth columns, derives RCAs from the ybuc table, and
    writes each result as a bz2-compressed TSV.
    """
    print "\nHEDU YEAR: {0}\n".format(year)
    pre_check()
    output_path = os.path.join(output_path, str(year))

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    hdf_store = pd.HDFStore(os.path.abspath(os.path.join(output_path, 'hedu_data.h5')))

    print '''\nImport file to pandas dataframe'''

    # Use the cached dataframe when present; otherwise parse the raw file
    # and try to cache it for the next run.
    if "hedu_df" in hdf_store:
        hedu_df = hdf_store['hedu_df']
    else:
        hedu_df = to_df(file_path, year)
        try:
            hdf_store['hedu_df'] = hedu_df
        except OverflowError:
            # Cache write failed; drop the partial store so the next run
            # starts clean.
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'hedu_data.h5'))

    tables_list = ["yb", "yu", "yc", "ybc", "ybu", "yuc", "ybuc"]
    # Maps each letter of a table name to the index column it represents.
    index_lookup = {"y": "year", "b": "bra_id", "c": "course_hedu_id", "u": "university_id"}

    for table_name in tables_list:
        indexes = [index_lookup[l] for l in table_name]

        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, hedu_df)

        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)

        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"student_id": "students"}, inplace=True)
        # Without a university dimension the university_id column holds a
        # count, so rename it accordingly.
        if 'u' not in table_name:
            aggregated_df.rename(columns={"university_id": "num_universities"}, inplace=True)

        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled', 'graduates'])

        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled', 'graduates'], 5)

        # The fully-dimensioned table feeds the RCA calculation.
        if table_name == "ybuc":
            print '''Calculating RCAs'''
            ybc = calc_rca(aggregated_df, year)
            new_file_path = os.path.abspath(os.path.join(output_path, "ybc_rca.tsv.bz2"))
            ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

        file_name = table_name + ".tsv.bz2"
        print '''Save {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
Exemplo n.º 5
0
def open_prev_df(prev_path, table_name, year, indexes):
    """Load <table_name>.tsv.bz2 from *prev_path* and re-stamp its "year"
    index level with *year* so it aligns with the current-year table."""
    path = os.path.join(prev_path, "{0}.tsv.bz2".format(table_name))
    df = to_df(path, year, indexes)
    # Replace the historical year value with the current one, keeping
    # "year" as the outermost index level.
    df = df.reset_index(level="year")
    df["year"] = int(year)
    df = df.set_index("year", append=True)
    level_names = list(df.index.names)
    return df.reorder_levels(["year"] + level_names[:-1])
def open_prev_df(prev_path, table_name, year, indexes):
    """Load <table_name>.tsv.bz2 from *prev_path* and re-stamp its "year"
    index level with *year* so it aligns with the current-year table.

    NOTE(review): this is a duplicate (re-formatted) definition of
    open_prev_df with identical behaviour; at import time it shadows the
    earlier one. Consider keeping only a single copy.
    """
    prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(table_name))
    previous_df = to_df(prev_file, year, indexes)
    # Replace the historical year value with the current one, keeping
    # "year" as the outermost index level.
    previous_df = previous_df.reset_index(level="year")
    previous_df["year"] = int(year)
    previous_df = previous_df.set_index("year", append=True)
    previous_df = previous_df.reorder_levels(
        ["year"] + list(previous_df.index.names)[:-1])
    return previous_df
Exemplo n.º 7
0
def main(file_path, table):
    """Load a CSV into a dataframe and bulk-insert it into a MySQL table.

    file_path -- path to the input data (also created as a directory if
                 missing, preserving the original script's behaviour --
                 NOTE(review): creating a directory at the *input* path
                 looks suspicious; confirm this was not meant to be an
                 output directory)
    table     -- destination table name (replaced if it already exists)

    DB credentials come from the DB_HOST / DB_USER / DB_PW / DB_NAME
    environment variables.
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    # BUGFIX: read the path the caller passed in -- the original ignored
    # its own argument and read sys.argv[1] instead, which breaks any
    # programmatic invocation.
    data_frame = to_df(file_path)

    connection = MySQLdb.connect(host=os.environ["DB_HOST"],
                                 user=os.environ["DB_USER"],
                                 passwd=os.environ["DB_PW"],
                                 db=os.environ["DB_NAME"])
    try:
        data_frame.to_sql(table, connection, flavor='mysql',
                          if_exists='replace', index=False, chunksize=650000)
    finally:
        # BUGFIX: the original leaked the DB connection.
        connection.close()
def main(file_path, year, output_path):
    """School-census (sc) pipeline: aggregate the raw file into the
    yb/ybd/yd/ybs/yc/ybc/ybcd tables and save each as a bz2 TSV.

    The parsed dataframe is cached as HDF5 under the output path so
    reruns skip the expensive raw-file import.
    """
    pre_check()
    output_path = os.path.join(output_path, str(year))
    
    if not os.path.exists(output_path): os.makedirs(output_path)
    print "Output Path=", output_path
    # d = pd.HDFStore(os.path.abspath(os.path.join(output_path,'sc_data.h5')))
    print; print '''STEP 1: \nImport file to pandas dataframe'''
    
    hdf_filepath = output_path + "/store_df.h5"

    print "LOOKING for HDF file at location ", hdf_filepath

    # Reuse the cached dataframe when present; otherwise parse and cache.
    if os.path.exists(hdf_filepath):
        print "READING HDF"
        df = pd.read_hdf(hdf_filepath, 'table')
    else:
        print "No HDF file. Need to create DF"
        df = to_df(file_path, False)
        print "SAVING HDF to", hdf_filepath
        df.to_hdf(hdf_filepath, 'table')

    print; print "Step 2: aggregate"

    from _aggregate import agg_rules


    # Maps each letter of a table name to its primary-key column.
    pk_lookup = {"y": "year", "d": "d_id", "b": "bra_id", "c": "course_sc_id", "s": "school_id"}
    
    tables_list = ["yb", "ybd", "yd", "ybs", "yc", "ybc", "ybcd"]

    for table_name in tables_list:
        pk = [pk_lookup[l] for l in table_name]
        print "working on", table_name
        
        # Demographic ("d") tables are built once per demographic split.
        dems = ['gender', 'color', 'loc', 'school_type'] if "d" in table_name else ['']
        
        for dem in dems:
            print '''\nSTEP 2: Aggregate {0}'''.format(dem)
            tbl = aggregate(table_name, pk, df, dem)
            
            # Course tables also get a coarser 2-digit course level,
            # appended below the full-depth aggregation.
            if "c" in table_name:
                pk2 = [x for x in pk]
                pk2[pk2.index("course_sc_id")] = df.course_sc_id.str.slice(0, 2)
                tbl_course2 = aggregate(table_name, pk2, df, dem, course_flag=True)

                tbl = pd.concat([tbl, tbl_course2])
            
            tbl = add_column_length(table_name, tbl)
            # tbl.rename(columns={"student_id": "students"}, inplace=True)   
            file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2"
            print '''Save {0} to output path'''.format(file_name)
            new_file_path = os.path.abspath(os.path.join(output_path, file_name))
            tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
Exemplo n.º 9
0
def calc_growth(year, current_year_file_path, prev_year_file_path,
                prev5_year_file_path):
    """Load the current-year table and attach 1- and 5-year growth columns.

    Returns a (table_name, dataframe) tuple. Either previous-year path may
    be falsy, in which case that lookback is skipped.
    """
    tbl_name = extract_tbl_name(current_year_file_path)
    current_year_tbl = to_df(current_year_file_path, tbl_name)

    for prev_path, years_ago in [(prev_year_file_path, 1),
                                 (prev5_year_file_path, 5)]:
        if not prev_path: continue
        prev_year_tbl = to_df(prev_path, tbl_name)

        # Re-stamp the historical "year" index level with the current year
        # so both tables share identical index values for the growth join.
        prev_year_tbl = prev_year_tbl.reset_index(level="year")
        prev_year_tbl["year"] = int(year)
        prev_year_tbl = prev_year_tbl.set_index("year", append=True)
        prev_year_tbl = prev_year_tbl.reorder_levels(
            ["year"] + list(prev_year_tbl.index.names)[:-1])

        current_year_tbl = do_growth(current_year_tbl, prev_year_tbl,
                                     years_ago)

    return (tbl_name, current_year_tbl)
Exemplo n.º 10
0
def main(file_path, output_path):
    """Filter the input CSV to January rows and write them to fout.csv.

    file_path   -- input data path
    output_path -- directory that will receive fout.csv (created if absent)
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # BUGFIX: read the path the caller passed in -- the original ignored
    # its own argument and read sys.argv[1] instead.
    data_frame = to_df(file_path)

    # Keep only rows for January (Month == 1).
    # (The original comment claimed a Municipality_ID filter; the code
    # has always filtered on Month.)
    data_frame = data_frame[(data_frame.Month == 1)]

    # Write output; 'with' guarantees the handle is closed (the original
    # leaked the open file object).
    new_file_path = os.path.abspath(os.path.join(output_path, "fout.csv"))
    with open(new_file_path, "wb+") as out_file:
        data_frame.to_csv(out_file, sep=";", index=False, float_format="%.3f", encoding="utf-8")
Exemplo n.º 11
0
def main(file_path, output_path):
    """Filter the input CSV to January rows and write them to fout.csv.

    file_path   -- input data path
    output_path -- directory that will receive fout.csv (created if absent)
    """
    # This should create the folder if it's not there yet
    if not os.path.exists(output_path): os.makedirs(output_path)

    # BUGFIX: read the path the caller passed in -- the original ignored
    # its own argument and read sys.argv[1] instead.
    data_frame = to_df(file_path)

    # Keep only rows for January (Month == 1).
    # (The original comment claimed a Municipality_ID filter; the code
    # has always filtered on Month.)
    data_frame = data_frame[(data_frame.Month == 1)]

    # Write output; 'with' guarantees the handle is closed (the original
    # leaked the open file object).
    new_file_path = os.path.abspath(os.path.join(output_path, "fout.csv"))
    with open(new_file_path, 'wb+') as out_file:
        data_frame.to_csv(out_file,
                          sep=";",
                          index=False,
                          float_format="%.3f",
                          encoding="utf-8")
Exemplo n.º 12
0
def main(file_path, year, output_path, prev_path, prev5_path, requireds_only):
    """Full RAIS pipeline for one *year*.

    When *file_path* is given: load (or reuse cached) raw data, build the
    yb/yi/yo/ybi/ybo/yio/ybio aggregates, then layer on importance,
    required, diversity and RDO calculations and save every table as a
    bz2-compressed TSV. When *prev_path* is given: compute growth columns
    for every saved table against the previous year (and, via
    *prev5_path*, against 5 years back).

    requireds_only -- if true, skip the cached aggregates and rebuild.
    """

    print; print "~~~**** YEAR: {0} ****~~~".format(year); print;
    start = time.time()
    step = 0
    # regions state, meso, micro, planning region, munic
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "cnae": [1, 3, 6],
        "cbo": [1, 4],
        "demo": [1, 4]
    }

    if file_path:
        if not os.path.exists(output_path): os.makedirs(output_path)
        # HDF5 cache: avoids re-parsing the raw file on reruns.
        d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            step+=1; print; print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
            rais_df = to_df(file_path, False)
            try:
                d['rais_df'] = rais_df
                # d.close()
            except OverflowError:
                # Cache write failed; drop the partial store so the next
                # run starts clean.
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
        # rais_df = to_df(file_path, False)

        # Reuse cached aggregates unless the caller asked for a rebuild.
        if "yb" in d and not requireds_only:
            tables = {"yb":d["yb"], "yo":d["yo"], "yi":d["yi"], "ybi":d["ybi"], "ybo":d["ybo"], "yio":d["yio"], "ybio":d["ybio"]}
        else:
            step+=1; print; print '''STEP {0}: \nAggregate'''.format(step)
            tables = aggregate(rais_df, depths)

            step+=1; print; print 'STEP {0}: \nImportance'.format(step)
            tables["yio"] = importance(tables["ybio"], tables["ybi"], tables["yio"], tables["yo"], year, depths)

            # Persist the aggregates in the same HDF cache for next time.
            try:
                d["yb"] = tables["yb"]; d["yo"] =  tables["yo"]; d["yi"] =  tables["yi"]; d["ybi"] = tables["ybi"]; d["ybo"] = tables["ybo"]; d["yio"] = tables["yio"]; d["ybio"] = tables["ybio"]
                d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))

        step+=1; print; print 'STEP {0}: \nRequired'.format(step)
        [tables["ybi"], tables["ybio"]] = required(tables["ybio"], tables["ybi"], tables["yi"], year, depths, output_path)

        # print tables["ybi"].head()
        # sys.exit()

        step+=1; print; print 'STEP {0}: \nDiversity'.format(step)
        # Diversity of each dimension with respect to each other dimension.
        tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id", "cnae_id", year, depths)
        tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id", "cbo_id", year, depths)
        tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id", "bra_id", year, depths)
        tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id", "cbo_id", year, depths)
        tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id", "bra_id", year, depths)
        tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id", "cnae_id", year, depths)

        step+=1; print; print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format(step)
        tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths)

        # NOTE(review): add_column_length's return value is discarded here;
        # this only has an effect if it mutates table_data in place --
        # compare the secex pipeline, which assigns the result back.
        for table_name, table_data in tables.items():
            table_data = add_column_length(table_name, table_data)

        print; print '''FINAL STEP: \nSave files to output path'''
        for t_name, t in tables.items():
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
            t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")

    if prev_path:
        print; print '''Calculating growth:'''
        # Growth is computed from the files just written, skipping any
        # *_growth outputs from earlier runs.
        for current_year_file_path in findFiles(output_path, '*.tsv.bz2'):
            if "growth" in current_year_file_path: continue
            current_year_file_name = os.path.basename(current_year_file_path)
            prev_year_file_path = os.path.join(prev_path, current_year_file_name)
            prev5_year_file_path = None
            if prev5_path:
                prev5_year_file_path = os.path.join(prev5_path, current_year_file_name)
            if not os.path.exists(prev_year_file_path):
                print "Unable to find", current_year_file_name, "for previous year."
                continue
            tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path)
            print tbl_name
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}_growth.tsv.bz2".format(tbl_name)))
            tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
            # os.remove(current_year_file_path)


    print("--- %s minutes ---" % str((time.time() - start)/60))
Exemplo n.º 13
0
def main(file_path, trade_flow, year, eci_file_path, pci_file_path, output_path, prev_path, prev5_path):
    start = time.time()
    step = 0
    
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "hs": [2, 6],
        "wld": [2, 5]
    }
    
    step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
    secex_df = to_df(file_path, False)
    secex_df = secex_df.head(1000)
    sys.exit()

    step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step)
    ybpw = aggregate(secex_df)

    step += 1; print '''\nSTEP {0}: \nShard'''.format(step)
    [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths)

    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw)

        step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        yb = domestic_eci(yp, yb, ybp, depths)

    step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
    yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths)
    yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths)
    yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths)
    yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths)
    yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths)
    yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths)

    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        yp = brazil_rca(yp, year)
    
    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step)
        ybp = rdo(ybp, yp, year, depths)
    if trade_flow == "import":
        step += 1; print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step)
        ybp = rcd(ybp, yp, year, depths)
    
    # print ybp.head(20)
    # sys.exit()
    
    tables = {"yb": yb, "yp": yp, "yw": yw, "ybp": ybp, "ybpw": ybpw, "ybw": ybw, "ypw": ypw}
    
    if prev_path:
        step += 1; print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1; print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            
            t = calc_growth(t, t_prev)
            
            if prev5_path:
                prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
                
                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
    
    total_run_time = (time.time() - start) / 60
    print; print;
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print; print;
def main(file_path, year, output_path):
    """HEDU pipeline variant: aggregate the raw file into all y* tables
    (including demographic splits), save each as a bz2 TSV, then compute
    RCAs from the ybuc table if it was produced.
    """
    pre_check()
    output_path = os.path.join(output_path, str(year))

    print "\nYEAR: {0}\n".format(year)
    this_output_path = os.path.join(output_path)
    if not os.path.exists(this_output_path): os.makedirs(this_output_path)
    
    step = 0
    step+=1; print '''STEP {0}: Import file to pandas dataframe'''.format(step)
    df = to_df(file_path, year)
    
    tables_list = ["yb", "ybd", "yd", "ybc", "yc", "ybu", "ybcd", "yu", "yuc", "yucd", "yud"]
    # Maps each letter of a table name to its primary-key column.
    pk_lookup = {"y": "year", "d": "d_id", "b": "bra_id", "c": "course_hedu_id", "u": "university_id"}

    # Kept so RCAs can be computed after the aggregation loop.
    # NOTE(review): "ybuc" never appears in tables_list, so this stays
    # None and the RCA step below never runs -- confirm whether "ybuc"
    # is missing from the list or the RCA step is intentionally disabled.
    ybuc = None

    for table_name in tables_list:
        pk = [pk_lookup[l] for l in table_name]
        print "working on", table_name
        
        # Demographic ("d") tables are built once per demographic split.
        dems = ['gender', 'ethnicity', 'school_type'] if "d" in table_name else ['']
        
        for dem in dems:
            print '''\nSTEP 2: Aggregate {0}'''.format(dem)
            tbl = aggregate(pk, df, dem)
            
            # Course tables also get a coarser 2-digit course level,
            # appended below the full-depth aggregation.
            if "c" in table_name:
                pk2 = [x for x in pk]
                pk2[pk2.index("course_hedu_id")] = df.course_hedu_id.str.slice(0, 2)
                # df2.course_hedu_id = df.course_hedu_id.str.slice(0, 2)
                tbl_course2 = aggregate(pk2, df, dem)

                tbl = pd.concat([tbl, tbl_course2])
            
            tbl = add_column_length(table_name, tbl)
            tbl.rename(columns={"student_id": "students"}, inplace=True)   
            # In the yb table university_id holds a count, not an id.
            if table_name == "yb":
                tbl.rename(columns={"university_id": "num_universities"}, inplace=True)   
            if table_name == "ybuc":
                print tbl.head()
                ybuc = tbl
            file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2"
            print '''Save {0} to output path'''.format(file_name)
            new_file_path = os.path.abspath(os.path.join(output_path, file_name))
            tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

        # if "c" in table_name:
        #     print '''\nSTEP 3: Aggregate {0}'''
        #     tbl = aggregate(pk, df, '', 2)
        #     tbl = add_column_length(table_name, tbl)
        #     # print tbl.reset_index().course_hedu_id.nunique()
        #     file_name = table_name + "_cid2.tsv.bz2"
        #     print '''Save {0} to output path'''.format(file_name)
        #     new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        #     tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)        
    
    if ybuc is not None:
        step+=1; print '''STEP {0}: Calculating RCAs'''.format(step)
        ybc = calc_rca(ybuc, year)
        new_file_path = os.path.abspath(os.path.join(output_path, "ybc_rca.tsv.bz2"))
        ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
        print "writing", new_file_path
Exemplo n.º 15
0
def main(year, output_path, attr_type):
    """Build wage histograms for every attribute id at every depth.

    *year* may be a single year ("2015") or an inclusive range
    ("2010-2015"). For the latest year the histogram bin parameters
    (min, max, bin size) are derived from each id's wage distribution and
    persisted in an HDF store; earlier years reuse those stored bins so
    histograms stay comparable across years. Sends an alert e-mail via
    SendGrid when finished.

    attr_type -- attribute family ("bra", "cnae", ...); depths come from
                 the module-level depths_lookup table.
    """

    if "-" in year:
        years = range(int(year.split('-')[0]), int(year.split('-')[1]) + 1)
    else:
        years = [int(year)]
    print("years:", str(years))

    start = time.time()
    for year in years:
        # Reuse the per-year cached RAIS dataframe when available.
        d = pd.HDFStore(os.path.join(output_path, str(year), 'rais_df_raw.h5'))
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            file_path = os.path.join(output_path, 'Rais_{}.csv'.format(year))
            rais_df = to_df(file_path)
            d['rais_df'] = rais_df

        # Store of per-id (wmin, wmax, bin_size) triples, shared across years.
        hist_bins = pd.HDFStore(
            os.path.join(output_path, '{}_hist_bins.h5'.format(attr_type)))

        for depth in depths_lookup[attr_type]:
            print("\n{} depth: {}\n".format(attr_type, depth))

            # Truncate ids to this depth so rows roll up to coarser codes.
            this_depth_df = rais_df.copy()
            this_depth_df['{}_id'.format(attr_type)] = this_depth_df[
                '{}_id'.format(attr_type)].str.slice(0, depth)

            # uniqs = ['1112', '8401', '8202', '7842', '7621']:
            uniqs = this_depth_df["{}_id".format(attr_type)].unique()
            for i, id in enumerate(uniqs):
                this_id_df = this_depth_df[this_depth_df['{}_id'.format(
                    attr_type)] == id]
                # Too few observations for a meaningful histogram.
                if len(this_id_df.index) < 10:
                    print("\nNot enough occurences for histogram")
                    continue
                print("********* {}: {} ({}/{}) *********".format(
                    year, id, i + 1, len(uniqs)),
                      end='\r')
                sys.stdout.flush()

                if int(year) == latest_year:
                    # Latest year defines the bins: span mean +/- 2 std
                    # devs (clamped at 0), bin size scaled to the range.
                    wage = this_id_df["wage"]
                    wmin = rounddown(wage.mean() - (wage.std() * 2))
                    wmin = 0 if wmin < 0 else wmin
                    wmax = rounddown(wage.mean() + (wage.std() * 2))
                    wrange = wmax - wmin
                    # print wrange
                    bin_size = 100
                    if wrange > 3000:
                        bin_size = 200
                    if wrange > 5000:
                        bin_size = 500
                    if wrange > 10000:
                        bin_size = 1000
                    ''' !!! exception for regions (all need to have same bins!) !!! '''
                    if attr_type == "bra" and depth == 1:
                        bin_size = 200
                        wmin = 0
                        wmax = 5200

                    hist_bins["{}_{}".format(attr_type, id)] = pd.Series(
                        [wmin, wmax, bin_size])
                else:
                    # Earlier years must reuse the latest year's bins;
                    # ids with no stored bins are skipped.
                    if "{}_{}".format(attr_type, id) in hist_bins:
                        wmin, wmax, bin_size = hist_bins["{}_{}".format(
                            attr_type, id)]
                    else:
                        continue

                hist(id, this_id_df, wmin, wmax, bin_size, attr_type, year)

        d.close()
        hist_bins.close()
    time_elapsed = "%s minutes" % str((time.time() - start) / 60)

    print('''\nTotal time %s''' % time_elapsed)
    print('''\nSending alert e-mail''')

    client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY'])
    message = sendgrid.Mail()

    message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**'))
    message.set_from("*****@*****.**")
    message.set_subject("Rais histogram for %s ready!" % year)
    message.set_html(
        "Your calculation took %s, please check out the output at the calc-server"
        % time_elapsed)

    client.send(message)
def main(file_path, year, output_path, prev_path, prev5_path):
    """Build RAIS demographic aggregation tables and save them as TSVs.

    Parses the raw RAIS file, aggregates it with demographics at several
    geographic depths, optionally attaches 1- and 5-year growth columns,
    and writes one bz2-compressed TSV per table.

    NOTE(review): the HDF store is opened before output_path is
    guaranteed to exist (makedirs only runs in the save loop), and the
    store is never used or closed -- confirm whether it is still needed.
    """
    start = time.time()
    step = 0
    geo_depths = [2, 4, 6, 7, 8]  # state, meso, micro, planning region, munic

    d = pd.HDFStore(os.path.abspath(os.path.join(output_path, 'rais.h5')))

    step += 1
    print
    print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
    rais_df = to_df(file_path, False, calc_d_id=True)

    step += 1
    print
    print '''STEP {0}: \nAggregate with Demographics'''.format(step)
    tables = aggregate_demographics(rais_df, geo_depths)

    # NOTE(review): add_column_length's return value is discarded here;
    # this only has an effect if it mutates table_data in place.
    for table_name, table_data in tables.items():
        table_data = add_column_length(table_name, table_data)
        # print table_data.head()

    if prev_path:
        step += 1
        print
        print '''STEP {0}: \nCalculate 1 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name, calc_d_id=True)
            # Re-stamp the previous table's "year" index level with the
            # current year so the growth join lines up on identical keys.
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] +
                                           list(t_prev.index.names)[:-1])

            tables[t_name] = calc_growth(t, t_prev)

            print tables[t_name].head()
            # sys.exit()

            if prev5_path:
                step += 1
                print
                print '''STEP {0}: \nCalculate 5 year growth'''.format(step)
                prev_file = os.path.join(prev5_path,
                                         "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name, calc_d_id=True)
                # Same re-stamping for the 5-year lookback table.
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] +
                                               list(t_prev.index.names)[:-1])

                # t_prev = to_df(prev_file, t_name)
                tables[t_name] = calc_growth(tables[t_name], t_prev, 5)

    print
    print '''FINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                 sep="\t",
                 index=True,
                 float_format="%.2f")

    print("--- %s minutes ---" % str((time.time() - start) / 60))
Exemplo n.º 17
0
# Tiny CLI utility: print the first rows of a CSV given on the command line.
import sys
from _to_df import to_df

# Require exactly one argument: the CSV file to preview.
if len(sys.argv) < 2:
    sys.exit("Usage: print.py csv_file.csv")

df = to_df(sys.argv[1])

print df.head()
def main(file_path, year, output_path, prev_path, prev5_path):
    """Build the higher-education (HEDU) aggregate tables for one year.

    Loads the raw file (cached in ``hedu_data.h5``), aggregates it over
    each year/bra/university/course combination, optionally attaches 1-
    and 5-year growth columns, and writes every table as a bz2-compressed
    TSV under ``output_path/<year>/``.

    file_path   -- path to the raw HEDU source file
    year        -- calendar year being processed
    output_path -- base output directory; a per-year subdirectory is created
    prev_path   -- directory holding last year's tables, or falsy to skip
    prev5_path  -- directory holding tables from five years back, or falsy
    """
    print "\nHEDU YEAR: {0}\n".format(year)
    pre_check()
    output_path = os.path.join(output_path, str(year))

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    hdf_store = pd.HDFStore(
        os.path.abspath(os.path.join(output_path, 'hedu_data.h5')))

    print '''\nImport file to pandas dataframe'''

    # Reuse the cached dataframe when a previous run stored one; otherwise
    # parse the raw file and cache it (best-effort: an OverflowError while
    # writing drops the cache file and we carry on in memory).
    if "hedu_df" in hdf_store:
        hedu_df = hdf_store['hedu_df']
    else:
        hedu_df = to_df(file_path, year)
        try:
            hdf_store['hedu_df'] = hedu_df
        except OverflowError:
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'hedu_data.h5'))

    # Each letter of a table name maps to one index column via index_lookup,
    # e.g. "ybu" -> ["year", "bra_id", "university_id"].
    tables_list = ["yb", "yu", "yc", "ybc", "ybu", "yuc", "ybuc"]
    index_lookup = {
        "y": "year",
        "b": "bra_id",
        "c": "course_hedu_id",
        "u": "university_id"
    }

    for table_name in tables_list:
        indexes = [index_lookup[l] for l in table_name]

        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, hedu_df)

        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)

        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"student_id": "students"}, inplace=True)
        # Tables not indexed by university get their university_id column
        # renamed to a count-style name.
        if 'u' not in table_name:
            aggregated_df.rename(columns={"university_id": "num_universities"},
                                 inplace=True)

        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled', 'graduates'])

        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled', 'graduates'], 5)

        # The fully disaggregated table also feeds the RCA calculation,
        # written out as a separate ybc_rca file.
        if table_name == "ybuc":
            print '''Calculating RCAs'''
            ybc = calc_rca(aggregated_df, year)
            new_file_path = os.path.abspath(
                os.path.join(output_path, "ybc_rca.tsv.bz2"))
            ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

        file_name = table_name + ".tsv.bz2"
        print '''Save {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                             sep="\t",
                             index=True)
Exemplo n.º 19
0
def main(file_path, year, output_path, prev_path, prev5_path):
    print "\nSC YEAR: {0}\n".format(year)
    start = time.time()
    pre_check()
    output_path = os.path.join(output_path, str(year))

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    hdf_store = pd.HDFStore(
        os.path.abspath(os.path.join(output_path, 'sc_data.h5')))

    print '''\nImport file to pandas dataframe'''

    if "sc_df" in hdf_store:
        sc_df = hdf_store['sc_df']
    else:
        sc_df = to_df(file_path)
        try:
            hdf_store['sc_df'] = sc_df
        except OverflowError:
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'sc_data.h5'))

    tables_list = ["yb", "yc", "ys", "ybs", "ybc", "ysc", "ybsc"]
    index_lookup = {"y": "year", "b": "bra_id", "c": "course_sc_id", "s": "school_id"}

    for table_name in tables_list:
        indexes = [index_lookup[l] for l in table_name]

        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, sc_df)

        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)

        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"enroll_id": "enrolled"}, inplace=True)
        aggregated_df.rename(columns={"class_id": "classes"}, inplace=True)
        if 's' not in table_name:
            aggregated_df.rename(columns={"school_id": "num_schools"}, inplace=True)

        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled'])

        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df, ['enrolled'], 5)

        file_name = table_name + ".tsv.bz2"
        print '''\nSave {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    time_elapsed = "%s minutes" % str((time.time() - start) / 60)

    print '''\nTotal time %s''' % time_elapsed
    print '''\nSending alert e-mail'''

    client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY'])
    message = sendgrid.Mail()

    message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**'))
    message.set_from("*****@*****.**")
    message.set_subject("Scholar census %s ready!" % year)
    message.set_html("Your calculation took %s, please check out the output at the calc-server" % time_elapsed)

    client.send(message)
Exemplo n.º 20
0
def main(file_path, year, output_path):
    """Aggregate HEDU data (with demographic splits) and write TSV tables.

    Builds one table per name in ``tables_list``; names containing "d" are
    additionally built once per demographic attribute (gender, ethnicity,
    school_type).  Files are written to ``output_path/<year>/``.
    """
    pre_check()
    output_path = os.path.join(output_path, str(year))

    print "\nYEAR: {0}\n".format(year)
    this_output_path = os.path.join(output_path)
    if not os.path.exists(this_output_path): os.makedirs(this_output_path)

    step = 0
    step += 1
    print '''STEP {0}: Import file to pandas dataframe'''.format(step)
    df = to_df(file_path, year)

    # Each letter of a table name maps to one primary-key column.
    tables_list = [
        "yb", "ybd", "yd", "ybc", "yc", "ybu", "ybcd", "yu", "yuc", "yucd",
        "yud"
    ]
    pk_lookup = {
        "y": "year",
        "d": "d_id",
        "b": "bra_id",
        "c": "course_hedu_id",
        "u": "university_id"
    }

    ybuc = None

    for table_name in tables_list:
        pk = [pk_lookup[l] for l in table_name]
        print "working on", table_name

        # Demographic tables run once per attribute; other tables do a
        # single pass with an empty demographic key.
        dems = ['gender', 'ethnicity', 'school_type'
                ] if "d" in table_name else ['']

        for dem in dems:
            print '''\nSTEP 2: Aggregate {0}'''.format(dem)
            tbl = aggregate(pk, df, dem)

            # Course tables are aggregated a second time at the 2-digit
            # course level: the pk entry is replaced by a sliced Series so
            # groupby groups on the prefix; both results are then stacked.
            if "c" in table_name:
                pk2 = [x for x in pk]
                pk2[pk2.index("course_hedu_id")] = df.course_hedu_id.str.slice(
                    0, 2)
                # df2.course_hedu_id = df.course_hedu_id.str.slice(0, 2)
                tbl_course2 = aggregate(pk2, df, dem)

                tbl = pd.concat([tbl, tbl_course2])

            tbl = add_column_length(table_name, tbl)
            tbl.rename(columns={"student_id": "students"}, inplace=True)
            # NOTE(review): sibling scripts use `'u' not in table_name` for
            # this rename; limiting it to "yb" here may be intentional or
            # an oversight -- confirm.
            if table_name == "yb":
                tbl.rename(columns={"university_id": "num_universities"},
                           inplace=True)
            # NOTE(review): "ybuc" is not in tables_list, so this branch
            # never fires, ybuc stays None and the RCA step at the bottom
            # is effectively dead. Confirm whether "ybuc" (or "yuc") was
            # meant to be listed.
            if table_name == "ybuc":
                print tbl.head()
                ybuc = tbl
            file_name = table_name + "_" + dem + ".tsv.bz2" if "d" in table_name else table_name + ".tsv.bz2"
            print '''Save {0} to output path'''.format(file_name)
            new_file_path = os.path.abspath(
                os.path.join(output_path, file_name))
            tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

        # if "c" in table_name:
        #     print '''\nSTEP 3: Aggregate {0}'''
        #     tbl = aggregate(pk, df, '', 2)
        #     tbl = add_column_length(table_name, tbl)
        #     # print tbl.reset_index().course_hedu_id.nunique()
        #     file_name = table_name + "_cid2.tsv.bz2"
        #     print '''Save {0} to output path'''.format(file_name)
        #     new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        #     tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    # Only reachable if the dead branch above ever sets ybuc (see NOTE).
    if ybuc is not None:
        step += 1
        print '''STEP {0}: Calculating RCAs'''.format(step)
        ybc = calc_rca(ybuc, year)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "ybc_rca.tsv.bz2"))
        ybc.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
        print "writing", new_file_path
def main(file_path, year, output_path, prev_path, prev5_path):
    print "\nSC YEAR: {0}\n".format(year)
    start = time.time()
    pre_check()
    output_path = os.path.join(output_path, str(year))

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    hdf_store = pd.HDFStore(
        os.path.abspath(os.path.join(output_path, 'sc_data.h5')))

    print '''\nImport file to pandas dataframe'''

    if "sc_df" in hdf_store:
        sc_df = hdf_store['sc_df']
    else:
        sc_df = to_df(file_path)
        try:
            hdf_store['sc_df'] = sc_df
        except OverflowError:
            print "WARNING: Unable to save dataframe, Overflow Error."
            hdf_store.close()
            os.remove(os.path.join(output_path, 'sc_data.h5'))

    tables_list = ["yb", "yc", "ys", "ybs", "ybc", "ysc", "ybsc"]
    index_lookup = {
        "y": "year",
        "b": "bra_id",
        "c": "course_sc_id",
        "s": "school_id"
    }

    for table_name in tables_list:
        indexes = [index_lookup[l] for l in table_name]

        print '''\nAggregating {0}'''.format(table_name)
        aggregated_df = aggregate(indexes, sc_df)

        print '''Adding length column to {0}'''.format(table_name)
        aggregated_df = add_column_length(table_name, aggregated_df)

        print '''Renaming {0} columns'''.format(table_name)
        aggregated_df.rename(columns={"enroll_id": "enrolled"}, inplace=True)
        aggregated_df.rename(columns={"class_id": "classes"}, inplace=True)
        if 's' not in table_name:
            aggregated_df.rename(columns={"school_id": "num_schools"},
                                 inplace=True)

        if prev_path:
            print '''\nCalculating {0} 1 year growth'''.format(table_name)
            previous_df = open_prev_df(prev_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled'])

        if prev5_path:
            print '''\nCalculating {0} 5 year growth'''.format(table_name)
            previous_df = open_prev_df(prev5_path, table_name, year, indexes)
            aggregated_df = calc_growth(aggregated_df, previous_df,
                                        ['enrolled'], 5)

        file_name = table_name + ".tsv.bz2"
        print '''\nSave {0} to output path'''.format(file_name)
        new_file_path = os.path.abspath(os.path.join(output_path, file_name))
        aggregated_df.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                             sep="\t",
                             index=True)

    time_elapsed = "%s minutes" % str((time.time() - start) / 60)

    print '''\nTotal time %s''' % time_elapsed
    print '''\nSending alert e-mail'''

    client = sendgrid.SendGridClient(os.environ['SENDGRID_API_KEY'])
    message = sendgrid.Mail()

    message.add_to(os.environ.get('ADMIN_EMAIL', '*****@*****.**'))
    message.set_from("*****@*****.**")
    message.set_subject("Scholar census %s ready!" % year)
    message.set_html(
        "Your calculation took %s, please check out the output at the calc-server"
        % time_elapsed)

    client.send(message)
Exemplo n.º 22
0
def main(export_file_path, import_file_path, year, eci_file_path,
         pci_file_path, ypw_file_path, output_path, prev_path, prev5_path):
    output_path = os.path.join(output_path, str(year))
    start = time.time()
    step = 0

    depths = {"bra": [1, 3, 5, 7, 9], "hs": [2, 6], "wld": [2, 5]}

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'secex.h5'))
    # if "ymb" in d:
    if "ymbp" in d:
        tables = {}
        tables["ymb"] = d["ymb"]
        tables["ymp"] = d["ymp"]
        tables["ymw"] = d["ymw"]
        tables["ymbp"] = d["ymbp"]
        tables["ymbw"] = d["ymbw"]
        tables["ympw"] = d["ympw"]
        tables["ymbpw"] = d["ymbpw"]
    else:
        step += 1
        print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
        secex_exports = to_df(export_file_path, False)
        secex_imports = to_df(import_file_path, False)

        step += 1
        print '''\nSTEP {0}: \nMerge imports and exports'''.format(step)
        secex_df = merge(secex_exports, secex_imports)

        step += 1
        print '''\nSTEP {0}: \nAggregate'''.format(step)
        ymbpw = aggregate(secex_df)

        step += 1
        print '''\nSTEP {0}: \nShard'''.format(step)
        [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw)

        step += 1
        print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year)

        step += 1
        print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
        ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id")
        ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id")
        ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id")
        ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id")
        ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id")
        ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id")

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1
        print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        ymp = brazil_rca(ymp, ypw_file_path, year)

        step += 1
        print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(
            step)
        ymbp = rdo(ymbp, ymp, year, depths["bra"], ypw_file_path)

        tables = {
            "ymb": ymb,
            "ymp": ymp,
            "ymw": ymw,
            "ymbp": ymbp,
            "ymbpw": ymbpw,
            "ymbw": ymbw,
            "ympw": ympw
        }
        for tbln, tbl in tables.items():
            d[tbln] = tbl

    if prev_path:
        step += 1
        print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1
            print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            print t_name
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] +
                                           list(t_prev.index.names)[:-1])

            t = calc_growth(t, t_prev)

            if prev5_path:
                prev_file = os.path.join(prev5_path,
                                         "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] +
                                               list(t_prev.index.names)[:-1])

                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print
    print
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print
    print
def main(file_path, year, output_path, prev_path, prev5_path, requireds_only):

    print
    print "~~~**** YEAR: {0} ****~~~".format(year)
    print
    start = time.time()
    step = 0
    # regions state, meso, micro, planning region, munic
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "cnae": [1, 3, 6],
        "cbo": [1, 4],
        "demo": [1, 4]
    }

    if file_path:
        if not os.path.exists(output_path): os.makedirs(output_path)
        d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            step += 1
            print
            print '''STEP {0}: \nImport file to pandas dataframe'''.format(
                step)
            rais_df = to_df(file_path, False)
            try:
                d['rais_df'] = rais_df
                # d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
        # rais_df = to_df(file_path, False)

        if "yb" in d and not requireds_only:
            tables = {
                "yb": d["yb"],
                "yo": d["yo"],
                "yi": d["yi"],
                "ybi": d["ybi"],
                "ybo": d["ybo"],
                "yio": d["yio"],
                "ybio": d["ybio"]
            }
        else:
            step += 1
            print
            print '''STEP {0}: \nAggregate'''.format(step)
            tables = aggregate(rais_df, depths)

            step += 1
            print
            print 'STEP {0}: \nImportance'.format(step)
            tables["yio"] = importance(tables["ybio"], tables["ybi"],
                                       tables["yio"], tables["yo"], year,
                                       depths)

            try:
                d["yb"] = tables["yb"]
                d["yo"] = tables["yo"]
                d["yi"] = tables["yi"]
                d["ybi"] = tables["ybi"]
                d["ybo"] = tables["ybo"]
                d["yio"] = tables["yio"]
                d["ybio"] = tables["ybio"]
                d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))

        step += 1
        print
        print 'STEP {0}: \nRequired'.format(step)
        [tables["ybi"],
         tables["ybio"]] = required(tables["ybio"], tables["ybi"],
                                    tables["yi"], year, depths, output_path)

        # print tables["ybi"].head()
        # sys.exit()

        step += 1
        print
        print 'STEP {0}: \nDiversity'.format(step)
        tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id",
                                      "cnae_id", year, depths)
        tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id",
                                      "cbo_id", year, depths)
        tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id",
                                      "bra_id", year, depths)
        tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id",
                                      "cbo_id", year, depths)
        tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id",
                                      "bra_id", year, depths)
        tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id",
                                      "cnae_id", year, depths)

        step += 1
        print
        print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format(
            step)
        tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths)

        for table_name, table_data in tables.items():
            table_data = add_column_length(table_name, table_data)

        print
        print '''FINAL STEP: \nSave files to output path'''
        for t_name, t in tables.items():
            new_file_path = os.path.abspath(
                os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
            t.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                     sep="\t",
                     index=True,
                     float_format="%.3f")

    if prev_path:
        print
        print '''Calculating growth:'''
        for current_year_file_path in findFiles(output_path, '*.tsv.bz2'):
            if "growth" in current_year_file_path: continue
            current_year_file_name = os.path.basename(current_year_file_path)
            prev_year_file_path = os.path.join(prev_path,
                                               current_year_file_name)
            prev5_year_file_path = None
            if prev5_path:
                prev5_year_file_path = os.path.join(prev5_path,
                                                    current_year_file_name)
            if not os.path.exists(prev_year_file_path):
                print "Unable to find", current_year_file_name, "for previous year."
                continue
            tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path,
                                                 prev_year_file_path,
                                                 prev5_year_file_path)
            print tbl_name
            new_file_path = os.path.abspath(
                os.path.join(output_path,
                             "{0}_growth.tsv.bz2".format(tbl_name)))
            tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                sep="\t",
                                index=True,
                                float_format="%.3f")
            # os.remove(current_year_file_path)

    print("--- %s minutes ---" % str((time.time() - start) / 60))
Exemplo n.º 24
0
def main(output_path, start_from):
    start = time.time()
    
    print "Reading rais_df_raw.h5..."
    if not os.path.exists(output_path): os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
    if "rais_df" in d:
        rais_df = d['rais_df']
    else:
        file_path = raw_input('No rais_df_raw.h5 found, raw file path: ')
        rais_df = to_df(file_path, False)
    d.close()
    
    year = int(rais_df["year"].unique().tolist()[0])
    rais_df = rais_df.drop(["year", "age", "color", "gender", "literacy", "d_id"], axis=1)
    
    cnaes = set(rais_df.cnae_id.unique())
    cnaes = cnaes.union({c[:3] for c in cnaes})
    cnaes = cnaes.union({c[:1] for c in cnaes})
    cnaes = list(cnaes)
    cnaes.sort()
    
    if start_from:
        cnaes = cnaes[cnaes.index(start_from):]
    
    last_seen = {d:{"id":None, "data":None} for d in depths["cnae"][:-1]}
    for i, cnae in enumerate(cnaes):
        s = time.time()
        
        prev_id_len = depths["cnae"].index(len(cnae))-1
        prev_id_len = depths["cnae"][prev_id_len]
        if prev_id_len in last_seen and cnae[:prev_id_len] == last_seen[prev_id_len]["id"]:
            cnae_df = last_seen[prev_id_len]["data"][last_seen[prev_id_len]["data"].cnae_id.str.startswith(cnae)]
        else:
            cnae_df = rais_df[rais_df.cnae_id.str.startswith(cnae)]
        
        if len(cnae) in last_seen:
            last_seen[len(cnae)]["id"] = cnae
            last_seen[len(cnae)]["data"] = cnae_df.copy()
        
        cnae_df = cnae_df[cnae_df.cnae_id.str.startswith(cnae)]
        if len(cnae) < 6:
            cnae_df["cnae_id"] = cnae_df["cnae_id"].str.slice(0, len(cnae))
        
        cnae_df = cnae_df[cnae_df["cnae_id"]==cnae]
        
        # cnae_df['est_id'] = cnae_df['est_id'].str.cat(cnae_df['est_size'].values.astype(str), sep='_')
        # cnae_df = cnae_df.drop(["est_size"], axis=1)
        
        ests = cnae_df.groupby(["est_size", "est_id"]).agg({"wage":pd.Series.median})
        bounds_lower = ests.groupby(level=["est_size"]).agg({"wage":lambda x: x.quantile(0.25)})
        bounds_upper = ests.groupby(level=["est_size"]).agg({"wage":lambda x: x.quantile(0.75)})
        ests = ests.reset_index(level=["est_id"])
        ests["lower"] = bounds_lower["wage"]
        ests["upper"] = bounds_upper["wage"]
        # print ests.shape
        ests = ests[(ests["wage"]>=ests["lower"]) & (ests["wage"]<=ests["upper"])]
        # print ests.shape
        # print ests["est_id"].count(), ests["est_id"].nunique()

        # cnae_df["est_qualify"] = cnae_df["est_id"].apply(lambda x: x in ests["est_id"].tolist())
        # cnae_df = cnae_df[cnae_df["est_qualify"]]
        cnae_df = cnae_df[cnae_df["est_id"].isin(ests["est_id"])]
        
        if cnae_df.empty: continue
        
        num_ests = ests.groupby(level=0).agg({"est_id":pd.Series.count})
        
        # cbos = cnae_df.groupby(["est_size", "cbo_id"]).agg({"est_id":lambda x: set.union(set(x)), "num_emp":pd.Series.count, "wage":pd.Series.median})
        cbos = cnae_df.groupby(["est_size", "cbo_id"]).agg({"est_id": pd.Series.nunique, "num_emp":pd.Series.count, "wage":pd.Series.median})
        cbos = cbos.reset_index(level=["cbo_id"])
        # cbos['test'] = cbos.est_id.str.len()
        cbos['num_est'] = num_ests["est_id"]
        cbos['qualify'] = cbos["est_id"] / cbos["num_est"]
        
        cbos = cbos[cbos['qualify'] >= 0.2]
        cbos["ene"] = cbos["num_emp"] / cbos["est_id"]
        cbos = cbos.reset_index().set_index(["cbo_id", "est_size"])
        # print cbos["ene"].head()
        
        ene = cbos["ene"].unstack(level=-1)
        ene = ene.rename(columns={0:'ene_micro', 1:'ene_small', 2:'ene_medium', 3:'ene_large'})
        ene["cnae_id"] = cnae
        ene["year"] = year
        
        ene = ene.set_index(["year", "cnae_id"], append=True)
        ene = ene.reorder_levels(["year", "cnae_id", "cbo_id"])
        
        print cnae, (time.time() - s) / 60
        
        fname_yio = os.path.join(output_path, 'ene.csv')
        if i == 0 and not start_from:
            ene.to_csv(fname_yio)
        else:
            ene.to_csv(open(fname_yio, 'a'), header=False)
        
    
    print "Done! Merging..."
    
    index_col = ["year", "cnae_id", "cbo_id"]
    full_tbl = pd.read_csv(os.path.join(output_path, "yio.tsv.bz2"), sep="\t", compression="bz2", converters={"cbo_id":str, "cnae_id":str, "year":int})
    full_tbl = full_tbl.set_index(index_col)
    ene_tbl = pd.read_csv(os.path.join(output_path, "ene.csv"), converters={"cbo_id":str, "cnae_id":str, "year":int})
    ene_tbl = ene_tbl.set_index(index_col)
    
    full_tbl = full_tbl.join(ene_tbl, how='outer')
    
    new_file_path = os.path.abspath(os.path.join(output_path, "yio_ene.tsv.bz2"))
    full_tbl.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.2f")
    
    print("--- %s minutes ---" % str((time.time() - start)/60))
def main(file_path, trade_flow, year, eci_file_path, pci_file_path,
         output_path, prev_path, prev5_path):
    start = time.time()
    step = 0

    depths = {"bra": [1, 3, 5, 7, 8, 9], "hs": [2, 6], "wld": [2, 5]}

    step += 1
    print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
    secex_df = to_df(file_path, False)
    secex_df = secex_df.head(1000)
    sys.exit()

    step += 1
    print '''\nSTEP {0}: \nAggregate'''.format(step)
    ybpw = aggregate(secex_df)

    step += 1
    print '''\nSTEP {0}: \nShard'''.format(step)
    [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw)

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        yb = domestic_eci(yp, yb, ybp, depths)

    step += 1
    print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
    yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths)
    yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths)
    yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths)
    yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths)
    yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths)
    yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        yp = brazil_rca(yp, year)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(
            step)
        ybp = rdo(ybp, yp, year, depths)
    if trade_flow == "import":
        step += 1
        print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step)
        ybp = rcd(ybp, yp, year, depths)

    # print ybp.head(20)
    # sys.exit()

    tables = {
        "yb": yb,
        "yp": yp,
        "yw": yw,
        "ybp": ybp,
        "ybpw": ybpw,
        "ybw": ybw,
        "ypw": ypw
    }

    if prev_path:
        step += 1
        print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1
            print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] +
                                           list(t_prev.index.names)[:-1])

            t = calc_growth(t, t_prev)

            if prev5_path:
                prev_file = os.path.join(prev5_path,
                                         "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] +
                                               list(t_prev.index.names)[:-1])

                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print
    print
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print
    print
def main(export_file_path, import_file_path, year, eci_file_path, pci_file_path, ypw_file_path, output_path):
    start = time.time()
    step = 0
    
    depths = {
        "bra": [1, 3, 5, 7, 9],
        "hs": [2, 6],
        "wld": [2, 5]
    }
    
    if not os.path.exists(output_path): os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'secex.h5'))
    # if "ymb" in d:
    if "ymbp" in d:
        tables = {}
        tables["ymb"] = d["ymb"]; tables["ymp"] = d["ymp"]; tables["ymw"] = d["ymw"]; tables["ymbp"] = d["ymbp"]; tables["ymbw"] = d["ymbw"]; tables["ympw"] = d["ympw"]; tables["ymbpw"] = d["ymbpw"]
    else:
        step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
        secex_exports = to_df(export_file_path, False)
        secex_imports = to_df(import_file_path, False)
        # secex_exports = secex_exports.head(1000)
        # secex_imports = secex_imports.head(1000)

        step += 1; print '''\nSTEP {0}: \nMerge imports and exports'''.format(step)
        secex_df = merge(secex_exports, secex_imports)

        step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step)
        ymbpw = aggregate(secex_df)

        step += 1; print '''\nSTEP {0}: \nShard'''.format(step)
        [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw)

        step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year)

        step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
        ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id")
        ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id")
        ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id")
        ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id")
        ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id")
        ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id")
        
        step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        ymp = brazil_rca(ymp, ypw_file_path, year)
    
        step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step)
        ymbp = rdo(ymbp, ymp, year, depths["bra"], ypw_file_path)
        
        tables = {"ymb": ymb, "ymp": ymp, "ymw": ymw, "ymbp": ymbp, "ymbpw": ymbpw, "ymbw": ymbw, "ympw": ympw}
        for tbln, tbl in tables.items():
            d[tbln] = tbl

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)
    
    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print; print;
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print; print;