def main(first_year_str, second_year_str, cols, output_path, edu=None, years=1, strcasts=""):
    """Compute growth between two yearly tables and write a bzip2-compressed TSV.

    Both inputs are resolved through ``get_file`` and read as tab-separated
    files. The second-year frame is passed to ``do_growth`` together with the
    first-year frame, and the frame it returns is written to
    ``<output_path>/<table name>.tsv.bz2``.

    Args:
        first_year_str: Locator of the earlier year's file (given to ``get_file``).
        second_year_str: Locator of the later year's file; also given to
            ``parse_table_name`` to derive the output file name.
        cols: Comma-separated names of the columns to compute growth for.
        output_path: Directory that receives the output file.
        edu: Forwarded unchanged to ``do_growth``.
        years: Forwarded unchanged to ``do_growth``.
        strcasts: Optional comma-separated column names to read as ``str``.
    """
    start = time.time()
    step = 1

    # Columns named in `strcasts` are read as strings instead of letting
    # pandas infer their dtype.
    converters = {name: str for name in strcasts.split(",")} if strcasts else {}

    # Every print below takes one pre-formatted argument so the output is the
    # same whether `print` is a statement (Python 2) or a function.
    print("")
    print('''STEP {0}: \nCalculate 1 year growth'''.format(step))

    orig_path = get_file(first_year_str)
    df1 = pd.read_csv(orig_path, sep="\t", converters=converters)
    new_path = get_file(second_year_str)
    df2 = pd.read_csv(new_path, sep="\t", converters=converters)

    col_names = cols.split(",")
    print("CALCULATING growth for the following columns: {0}".format(col_names))
    t_name = parse_table_name(second_year_str)

    df2 = do_growth(t_name, df2, df1, col_names, years, edu)

    print("GOT TABLE NAME OF  {0}".format(t_name))
    if not t_name:
        # Fall back to a fixed file name when no table name could be parsed.
        t_name = "noname"
    new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))

    # Close the compressed stream explicitly: an unclosed BZ2File may never
    # flush its final block, leaving a truncated archive on disk.
    with bz2.BZ2File(new_file_path, 'wb') as out:
        df2.to_csv(out, sep="\t", index=False, float_format="%.4f")

    print("--- %s minutes ---" % str((time.time() - start)/60))
示例#2
0
def updateMDICxIBGE():
    '''Report attrs_bra rows whose MDIC code differs from MDICxIBGE.csv.

    Reads DATA_DIR/secex/MDICxIBGE.csv (";"-delimited) through get_file,
    skipping the first row. For each remaining row it looks up attrs_bra by
    the IBGE code (column 7) and compares the stored id_mdic with the MDIC
    code from the file (column 9). Every mismatch is printed, together with
    a notice when secex_yb holds rows for that location.

    NOTE: the UPDATE statement is built but its execution is commented out,
    so this function only reports; it does not modify the database.
    '''
    raw_file_path = os.path.abspath(os.path.join(DATA_DIR, 'secex', 'MDICxIBGE.csv'))
    raw_file = get_file(raw_file_path)
    csv_reader = csv.reader(raw_file, delimiter=";")
    for i, line in enumerate(csv_reader):
        # Skip the first row *before* indexing into it, so a short first row
        # cannot raise IndexError.
        if i == 0:
            continue

        # 7 = IBGE , 9 = MDIC
        ibge_cod = line[7].strip()
        mdic_cod = line[9].strip()

        # NOTE(review): the SQL below is assembled by string formatting from
        # CSV values. Switch to the driver's parameter binding once the
        # cursor's paramstyle is confirmed.
        sql = "select id_mdic, id from attrs_bra where id_ibge = {0}".format(ibge_cod)
        cursor.execute(sql)
        values = cursor.fetchall()
        size = len(values)
        if size != 1:
            print("Error finding a IBGE {0}: Found {1}".format(ibge_cod, size))
            continue

        old_mdic = str(values[0][0])
        old_idbra = str(values[0][1])
        if old_mdic != mdic_cod:
            print("Changing MDIC code from {0} to {1} in IBGE cod {2}".format(old_mdic, mdic_cod, ibge_cod))
            sql = "select * from secex_yb where bra_id='{0}'".format(old_idbra)
            cursor.execute(sql)
            if len(cursor.fetchall()) > 0:
                print("Exports found for {0}".format(ibge_cod))
            # Dry run: the statement is prepared but deliberately not executed.
            sql = "update attrs_bra set id_mdic = {0} where id_ibge = {1}".format(mdic_cod, ibge_cod)
            #cursor.execute(sql)
示例#3
0
def to_df(input_file_path, separator=";", index=False):
    """Read a delimited text file into a pandas DataFrame and print the read time.

    Args:
        input_file_path: Locator handed to ``get_file``; its result is what
            pandas reads from.
        separator: Field delimiter passed to ``pandas.read_csv``.
        index: Unused here; kept so the signature stays compatible.

    Returns:
        The DataFrame parsed with ``,`` as the decimal mark and UTF-8 encoding.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    df = pandas.read_csv(input_file, sep=separator, engine='c', decimal=',', encoding='utf-8')

    # One pre-formatted argument: the former `print (...) / 60.0, "..."` form
    # is evaluated as `print(...) / 60.0` when print is a function and raises
    # TypeError. The printed text is unchanged under the print statement.
    print("{0} minutes to read.".format((time.time() - s) / 60.0))

    return df
示例#4
0
def main(first_year_str,
         second_year_str,
         cols,
         output_path,
         edu=None,
         years=1,
         strcasts=""):
    """Run the growth calculation for two yearly files and save the result.

    The two inputs are opened via ``get_file`` and parsed as tab-separated
    tables. ``do_growth`` receives the parsed table name, both frames, the
    requested columns, ``years`` and ``edu``; the frame it returns is written
    as ``<table name>.tsv.bz2`` inside ``output_path``.

    Args:
        first_year_str: Locator of the earlier year's file.
        second_year_str: Locator of the later year's file; the output name is
            derived from it with ``parse_table_name``.
        cols: Comma-separated list of columns to calculate growth for.
        output_path: Destination directory.
        edu: Passed through to ``do_growth``.
        years: Passed through to ``do_growth``.
        strcasts: Comma-separated columns that must be read as strings
            (empty string means none).
    """
    start = time.time()
    step = 0

    converters = {}
    if strcasts:
        # Force the listed columns to be read as `str`.
        for column in strcasts.split(","):
            converters[column] = str

    step += 1
    # Single-argument prints behave the same as a statement or a function.
    print("")
    print('''STEP {0}: \nCalculate 1 year growth'''.format(step))

    orig_path = get_file(first_year_str)
    df1 = pd.read_csv(orig_path, sep="\t", converters=converters)
    new_path = get_file(second_year_str)
    df2 = pd.read_csv(new_path, sep="\t", converters=converters)

    col_names = cols.split(",")
    print("CALCULATING growth for the following columns: {0}".format(
        col_names))
    t_name = parse_table_name(second_year_str)

    df2 = do_growth(t_name, df2, df1, col_names, years, edu)

    print("GOT TABLE NAME OF  {0}".format(t_name))
    if not t_name:
        # No table name could be parsed: use a fixed fallback file name.
        t_name = "noname"
    file_name = "{0}.tsv.bz2".format(t_name)
    new_file_path = os.path.abspath(os.path.join(output_path, file_name))

    # The BZ2File used to be created inline and never closed; closing it
    # guarantees the compressed stream is finalized and the handle released.
    with bz2.BZ2File(new_file_path, 'wb') as compressed:
        df2.to_csv(compressed,
                   sep="\t",
                   index=False,
                   float_format="%.4f")

    print("--- %s minutes ---" % str((time.time() - start) / 60))
示例#5
0
def to_df(input_file_path, separator=";", index=False):
    """Load a delimited file into a DataFrame, printing how long the read took.

    Args:
        input_file_path: Locator resolved with ``get_file``.
        separator: Column delimiter for ``pandas.read_csv``.
        index: Not used by this function; retained for signature compatibility.

    Returns:
        The parsed DataFrame (decimal mark ``,``, UTF-8 encoding, C engine).
    """
    input_file = get_file(input_file_path)
    s = time.time()

    df = pandas.read_csv(input_file,
                         sep=separator,
                         engine='c',
                         decimal=',',
                         encoding='utf-8')

    # The old line read `print(time.time() - s) / 60.0, "..."`. That is a
    # valid print *statement*, but as a function call it divides print's
    # return value (None) by 60.0. Formatting first avoids the ambiguity
    # and keeps the printed text the same.
    minutes = (time.time() - s) / 60.0
    print("{0} minutes to read.".format(minutes))

    return df
示例#6
0
def load_model(model_path=MODEL, weights_path=WEIGHTS):
    """Return the cached module-level model, building it on first use.

    On the first successful call the architecture is read from the JSON file
    at ``model_path``, weights are obtained via ``helpers.get_file`` and
    applied, the model is compiled, and the result is stored in the global
    ``_model``. Later calls return that cached object.

    Args:
        model_path: Path of the JSON file describing the model.
        weights_path: Locator handed to ``helpers.get_file`` for the weights.

    Returns:
        The compiled model, or ``None`` when loading failed with ``IOError``.
    """
    global _model
    print("load model")
    if _model is not None:
        return _model

    try:
        # Context manager closes the file even if read() raises.
        with open(model_path, 'r') as json_file:
            loaded_model_json = json_file.read()
        # Build into a local first: the global is only published once the
        # weights are set and the model is compiled, so a failure midway can
        # never leave a half-initialized model cached.
        model = model_from_json(loaded_model_json)
        weights = helpers.get_file(weights_path)
        model.set_weights(weights)
        model.compile(optimizer="adam",
                      loss='categorical_crossentropy',
                      metrics=['top_k_categorical_accuracy'])
    except IOError as err:
        # Still best-effort (returns None), but no longer silent.
        print("Could not load model: {0}".format(err))
        _model = None
        return _model

    _model = model
    print("Loaded model from disk")
    return _model
示例#7
0
def to_df(input_file_path, index=False):
    """Load a data file into a DataFrame, either pre-processed or raw.

    Args:
        input_file_path: Locator resolved with ``get_file``.
        index: Falsy to parse a raw ";"-delimited file. Otherwise an iterable
            of one-letter keys (``y``, ``b``, ``i``, ``o``, ``d``) naming the
            columns to index by; the file is then read as tab-separated.

    Returns:
        With ``index``: the frame indexed by the requested columns.
        Without: a frame with the columns year, bra_id, cnae_id, cbo_id, wage,
        num_emp, est_id and age, after removing rows whose age equals -999
        and, for every non-empty entry of the module-level ``missing``
        mapping, rows that are NaN in that column.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {"y": "year", "b": "bra_id", "i": "cnae_id", "o": "cbo_id", "d": "d_id"}
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file, sep="\t", converters={"bra_id": str, "cbo_id": str, "cnae_id": str})
        return rais_df.set_index(index_cols)

    # Names applied to the raw columns; the file's own header row (header=0)
    # is discarded in favour of these.
    cols = ["cbo_id", "cnae_id", "literacy", "age", "est_id", "simple", "bra_id", "num_emp", "color", "wage_dec", "wage", "gender", "est_size", "year"]
    # NOTE(review): "emp_id" is not one of `cols`, so that converter has no
    # matching column - confirm whether "num_emp" was intended.
    coerce_cols = {"bra_id": bra_replace, "cnae_id": cnae_replace, "cbo_id": cbo_replace,
                   "emp_id": str, "est_id": str, "age": convertint}
    rais_df = pd.read_csv(input_file, header=0, sep=";", names=cols, converters=coerce_cols, engine='c', decimal=',')
    rais_df = rais_df[["year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id", "age"]]

    # All prints take one pre-formatted argument so they produce the same
    # text whether print is a statement or a function.
    print("first remove rows with empty ages, if any...")
    # -999 is the value this code treats as "empty age".
    count = rais_df[rais_df.age == -999].age.count()
    if count > 0:
        print("** REMOVED {0} rows due to empty ages".format(count))
    rais_df = rais_df[rais_df.age != -999]

    print("finding missing attrs...")
    for col, missings in missing.items():
        if not len(missings):
            continue

        num_rows = rais_df.shape[0]
        print("")
        print("[WARNING]")
        print("The following {0} IDs are not in the DB and will be dropped from the data.".format(col))
        print(list(missings))
        rais_df = rais_df.dropna(subset=[col])
        print("")
        print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
        print("")

    print("{0} minutes to read.".format((time.time() - s) / 60.0))

    return rais_df
示例#8
0
def to_df(input_file_path, index=False):
    """Load a data file into a DataFrame, deriving demographic and size codes.

    Args:
        input_file_path: Locator resolved with ``get_file``.
        index: Falsy to parse a raw ";"-delimited file. Otherwise an iterable
            of one-letter keys (``y``, ``b``, ``i``, ``o``, ``d``) naming the
            columns to index by; the file is then read as tab-separated.

    Returns:
        With ``index``: the frame indexed by the requested columns.
        Without: a frame with year, bra_id, cnae_id, cbo_id, wage, num_emp,
        est_id, age, est_size (recoded) and d_id. ``d_id`` concatenates the
        gender code, the age decade clipped to 1..6, the color code and the
        literacy code. Rows with age -999, and rows that are NaN in any
        column with a non-empty entry in the module-level ``missing``
        mapping, are removed.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {"y": "year", "b": "bra_id", "i": "cnae_id", "o": "cbo_id", "d": "d_id"}
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file, sep="\t", converters={"bra_id": str, "cbo_id": str, "cnae_id": str})
        return rais_df.set_index(index_cols)

    # Names applied to the raw columns; the file's own header row is dropped.
    cols = ["cbo_id", "cnae_id", "literacy", "age", "est_id", "simple", "bra_id", "num_emp", "color", "wage_dec", "wage", "gender", "est_size", "year"]
    # NOTE(review): "emp_id" is not one of `cols`, so that converter has no
    # matching column - confirm whether "num_emp" was intended.
    coerce_cols = {"bra_id": bra_replace, "cnae_id": cnae_replace, "cbo_id": cbo_replace,
                   "emp_id": str, "est_id": str, "age": convertint}
    rais_df = pd.read_csv(input_file, header=0, sep=";", names=cols, converters=coerce_cols, engine='c', decimal=',')
    rais_df = rais_df[["year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id", "age", "color", "gender", "est_size", "literacy"]]

    # All prints take one pre-formatted argument so they produce the same
    # text whether print is a statement or a function.
    print("first remove rows with empty ages, if any...")
    # -999 is the value this code treats as "empty age".
    count = rais_df[rais_df.age == -999].age.count()
    if count > 0:
        print("** REMOVED {0} rows due to empty ages".format(count))
    rais_df = rais_df[rais_df.age != -999]

    print("finding missing attrs...")
    for col, missings in missing.items():
        if not len(missings):
            continue
        num_rows = rais_df.shape[0]
        print("")
        print("[WARNING]")
        print("The following {0} IDs are not in the DB and will be dropped from the data.".format(col))
        print(list(missings))
        rais_df = rais_df.dropna(subset=[col])
        print("")
        print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
        print("")

    print("generating demographic codes...")
    FEMALE, MALE = 0, 1
    gender_dict = {MALE: 'A', FEMALE: 'B'}
    rais_df["gender"] = rais_df["gender"].replace(gender_dict)

    INDIAN, WHITE, BLACK, ASIAN, MULTI, UNKNOWN = 1, 2, 4, 6, 8, 9
    color_dict = {INDIAN: 'C', WHITE: 'D', BLACK: 'E', ASIAN: 'F', MULTI: 'G', UNKNOWN: 'H', -1: 'H'}
    rais_df["color"] = rais_df["color"].replace(color_dict)

    lit_dict = {1: 'I', 2: 'I', 3: 'J', 4: 'J', 5: 'J', 6: 'J', 7: 'K', 8: 'K', 9: 'L', -1: 'M'}
    rais_df["literacy"] = rais_df["literacy"].replace(lit_dict)

    # Age decade, clamped to the range 1..6.
    rais_df["age_demo"] = (rais_df["age"] / 10).astype(int)
    rais_df["age_demo"] = rais_df["age_demo"].clip(1, 6)

    rais_df["d_id"] = rais_df['gender'].str.cat([rais_df['age_demo'].values.astype(str), rais_df['color'].values.astype(str), rais_df['literacy'].values.astype(str)])

    rais_df = rais_df.drop(["gender", "color", "age_demo", "literacy"], axis=1)

    print("determining establishment sizes...")
    # Flag each row from the integer in cnae_id[1:3]: -1 when it lies in
    # 5..35, 0 otherwise.
    rais_df["new_est_size_1"] = rais_df["cnae_id"].str.slice(1, 3).astype(int)
    rais_df.loc[rais_df["new_est_size_1"].between(5, 35), "new_est_size_1"] = -1
    rais_df.loc[rais_df["new_est_size_1"] >= 0, "new_est_size_1"] = 0

    # Split the flag into two complementary columns (NaN where a row belongs
    # to the other group). These results were previously truncated with
    # .head(), so after index-aligned assignment only the first five rows
    # kept a value and every other row ended up with est_size == 0.
    rais_df["new_est_size_2"] = rais_df["new_est_size_1"].mask(rais_df["new_est_size_1"] >= 0)
    rais_df["new_est_size_1"] = rais_df["new_est_size_1"].where(rais_df["new_est_size_1"] >= 0)

    # Turn the flags into "est_size for my group, NaN for the other group".
    rais_df.loc[rais_df["new_est_size_2"] == -1, "new_est_size_2"] = 0
    rais_df["new_est_size_2"] = rais_df["new_est_size_2"] + rais_df["est_size"]
    rais_df["new_est_size_1"] = rais_df["new_est_size_1"] + rais_df["est_size"]

    # Each group is recoded with its own thresholds.
    est_size_1_lookup = {1: 0, 2: 0, 3: 1, 4: 1, 5: 2, 6: 3, 7: 3, 8: 3, 9: 3}
    est_size_2_lookup = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 2, 7: 2, 8: 3, 9: 3}

    rais_df["new_est_size_1"] = rais_df["new_est_size_1"].replace(est_size_1_lookup)
    rais_df["new_est_size_2"] = rais_df["new_est_size_2"].replace(est_size_2_lookup)

    # Exactly one of the two columns is non-NaN per row, so the sum selects it.
    rais_df["est_size"] = rais_df["new_est_size_1"].fillna(0) + rais_df["new_est_size_2"].fillna(0)

    rais_df = rais_df.drop(["new_est_size_1", "new_est_size_2"], axis=1)

    print("{0} minutes to read.".format((time.time() - s) / 60.0))

    return rais_df
示例#9
0
def to_df(input_file_path, index=False):
    """Read an input file into a DataFrame (indexed TSV or raw ";" file).

    Args:
        input_file_path: Locator resolved with ``get_file``.
        index: When truthy, an iterable of one-letter keys (``y``, ``b``,
            ``i``, ``o``, ``d``) selecting the index columns of a
            tab-separated file. When falsy, the raw ";"-delimited layout is
            parsed instead.

    Returns:
        With ``index``: the tab-separated frame indexed by the chosen columns.
        Without: the columns year, bra_id, cnae_id, cbo_id, wage, num_emp,
        est_id and age, minus rows whose age is -999 and minus rows that are
        NaN in any column with a non-empty entry in the module-level
        ``missing`` mapping.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {
            "y": "year",
            "b": "bra_id",
            "i": "cnae_id",
            "o": "cbo_id",
            "d": "d_id"
        }
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file,
                              sep="\t",
                              converters={
                                  "bra_id": str,
                                  "cbo_id": str,
                                  "cnae_id": str
                              })
        rais_df = rais_df.set_index(index_cols)
    else:
        # Column names imposed on the raw file; its own header line is
        # skipped via header=0.
        cols = [
            "cbo_id", "cnae_id", "literacy", "age", "est_id", "simple",
            "bra_id", "num_emp", "color", "wage_dec", "wage", "gender",
            "est_size", "year"
        ]
        # NOTE(review): "emp_id" does not appear in `cols`, so that converter
        # has no matching column - confirm whether "num_emp" was intended.
        coerce_cols = {
            "bra_id": bra_replace,
            "cnae_id": cnae_replace,
            "cbo_id": cbo_replace,
            "emp_id": str,
            "est_id": str,
            "age": convertint
        }
        rais_df = pd.read_csv(input_file,
                              header=0,
                              sep=";",
                              names=cols,
                              converters=coerce_cols,
                              engine='c',
                              decimal=',')
        rais_df = rais_df[[
            "year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id",
            "age"
        ]]

        # Every print passes a single formatted string, so the output is the
        # same whether print is a statement or a function.
        print("first remove rows with empty ages, if any...")
        # -999 is the value this code treats as "empty age".
        empty_age = rais_df.age == -999
        count = rais_df[empty_age].age.count()
        if count > 0:
            print("** REMOVED {0} rows due to empty ages".format(count))
        rais_df = rais_df[rais_df.age != -999]

        print("finding missing attrs...")
        for col, missings in missing.items():
            if not len(missings):
                continue

            num_rows = rais_df.shape[0]
            print("")
            print("[WARNING]")
            print(
                "The following {0} IDs are not in the DB and will be dropped from the data."
                .format(col))
            print(list(missings))
            rais_df = rais_df.dropna(subset=[col])
            print("")
            print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
            print("")

        # Previously `print(time.time() - s) / 60.0, "..."`, which as a
        # function call divides print's None result and raises TypeError.
        elapsed_minutes = (time.time() - s) / 60.0
        print("{0} minutes to read.".format(elapsed_minutes))

    return rais_df
示例#10
0
def _add_demographic_id(rais_df):
    """Collapse gender, age, color and literacy into one ``d_id`` string column."""
    FEMALE, MALE = 0, 1
    gender_dict = {MALE: 'A', FEMALE: 'B'}
    rais_df["gender"] = rais_df["gender"].replace(gender_dict)

    INDIAN, WHITE, BLACK, ASIAN, MULTI, UNKNOWN = 1, 2, 4, 6, 8, 9
    color_dict = {
        INDIAN: 'C',
        WHITE: 'D',
        BLACK: 'E',
        ASIAN: 'F',
        MULTI: 'G',
        UNKNOWN: 'H',
        -1: 'H'
    }
    rais_df["color"] = rais_df["color"].replace(color_dict)

    lit_dict = {
        1: 'I',
        2: 'I',
        3: 'J',
        4: 'J',
        5: 'J',
        6: 'J',
        7: 'K',
        8: 'K',
        9: 'L',
        -1: 'M'
    }
    rais_df["literacy"] = rais_df["literacy"].replace(lit_dict)

    # Age decade, clamped to the range 1..6.
    rais_df["age_demo"] = (rais_df["age"] / 10).astype(int)
    rais_df["age_demo"] = rais_df["age_demo"].clip(1, 6)

    # d_id = gender code + age decade + color code + literacy code.
    rais_df["d_id"] = rais_df['gender'].str.cat([
        rais_df['age_demo'].values.astype(str),
        rais_df['color'].values.astype(str),
        rais_df['literacy'].values.astype(str)
    ])

    return rais_df.drop(["gender", "color", "age_demo", "literacy"], axis=1)


def _recode_est_size(rais_df):
    """Recode ``est_size`` with one of two lookups chosen from ``cnae_id[1:3]``."""
    # Flag each row from the integer in cnae_id[1:3]: -1 when it lies in
    # 5..35, 0 otherwise.
    rais_df["new_est_size_1"] = rais_df["cnae_id"].str.slice(1, 3).astype(int)
    rais_df.loc[rais_df["new_est_size_1"].between(5, 35),
                "new_est_size_1"] = -1
    rais_df.loc[rais_df["new_est_size_1"] >= 0, "new_est_size_1"] = 0

    # Complementary columns: each is NaN where the row belongs to the other
    # group. The results used to be cut down with .head(), which - after
    # index-aligned assignment - left a value in only the first five rows and
    # forced est_size to 0 everywhere else.
    outside_range = rais_df["new_est_size_1"] >= 0
    rais_df["new_est_size_2"] = rais_df["new_est_size_1"].mask(outside_range)
    rais_df["new_est_size_1"] = rais_df["new_est_size_1"].where(outside_range)

    # Turn the flags into "est_size for my group, NaN for the other group".
    rais_df.loc[rais_df["new_est_size_2"] == -1, "new_est_size_2"] = 0
    rais_df["new_est_size_2"] = (rais_df["new_est_size_2"] +
                                 rais_df["est_size"])
    rais_df["new_est_size_1"] = (rais_df["new_est_size_1"] +
                                 rais_df["est_size"])

    est_size_1_lookup = {
        1: 0,
        2: 0,
        3: 1,
        4: 1,
        5: 2,
        6: 3,
        7: 3,
        8: 3,
        9: 3
    }
    est_size_2_lookup = {
        1: 0,
        2: 0,
        3: 0,
        4: 1,
        5: 1,
        6: 2,
        7: 2,
        8: 3,
        9: 3
    }

    rais_df["new_est_size_1"] = rais_df["new_est_size_1"].replace(
        est_size_1_lookup)
    rais_df["new_est_size_2"] = rais_df["new_est_size_2"].replace(
        est_size_2_lookup)

    # Exactly one of the two columns is non-NaN per row, so the sum selects it.
    rais_df["est_size"] = (rais_df["new_est_size_1"].fillna(0) +
                           rais_df["new_est_size_2"].fillna(0))

    return rais_df.drop(["new_est_size_1", "new_est_size_2"], axis=1)


def to_df(input_file_path, index=False):
    """Read an input file into a DataFrame, adding ``d_id`` and recoded sizes.

    Args:
        input_file_path: Locator resolved with ``get_file``.
        index: When truthy, an iterable of one-letter keys (``y``, ``b``,
            ``i``, ``o``, ``d``) selecting the index columns of a
            tab-separated file. When falsy, the raw ";"-delimited layout is
            parsed and post-processed.

    Returns:
        With ``index``: the tab-separated frame indexed by the chosen columns.
        Without: a frame with year, bra_id, cnae_id, cbo_id, wage, num_emp,
        est_id, age, est_size (recoded) and d_id, after removing rows whose
        age is -999 and rows that are NaN in any column with a non-empty
        entry in the module-level ``missing`` mapping.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {
            "y": "year",
            "b": "bra_id",
            "i": "cnae_id",
            "o": "cbo_id",
            "d": "d_id"
        }
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file,
                              sep="\t",
                              converters={
                                  "bra_id": str,
                                  "cbo_id": str,
                                  "cnae_id": str
                              })
        rais_df = rais_df.set_index(index_cols)
    else:
        # Column names imposed on the raw file; its own header line is
        # skipped via header=0.
        cols = [
            "cbo_id", "cnae_id", "literacy", "age", "est_id", "simple",
            "bra_id", "num_emp", "color", "wage_dec", "wage", "gender",
            "est_size", "year"
        ]
        # NOTE(review): "emp_id" does not appear in `cols`, so that converter
        # has no matching column - confirm whether "num_emp" was intended.
        coerce_cols = {
            "bra_id": bra_replace,
            "cnae_id": cnae_replace,
            "cbo_id": cbo_replace,
            "emp_id": str,
            "est_id": str,
            "age": convertint
        }
        rais_df = pd.read_csv(input_file,
                              header=0,
                              sep=";",
                              names=cols,
                              converters=coerce_cols,
                              engine='c',
                              decimal=',')
        rais_df = rais_df[[
            "year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id",
            "age", "color", "gender", "est_size", "literacy"
        ]]

        # Every print passes a single formatted string, so the output is the
        # same whether print is a statement or a function.
        print("first remove rows with empty ages, if any...")
        # -999 is the value this code treats as "empty age".
        count = rais_df[rais_df.age == -999].age.count()
        if count > 0:
            print("** REMOVED {0} rows due to empty ages".format(count))
        rais_df = rais_df[rais_df.age != -999]

        print("finding missing attrs...")
        for col, missings in missing.items():
            if not len(missings):
                continue
            num_rows = rais_df.shape[0]
            print("")
            print("[WARNING]")
            print(
                "The following {0} IDs are not in the DB and will be dropped from the data."
                .format(col))
            print(list(missings))
            rais_df = rais_df.dropna(subset=[col])
            print("")
            print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
            print("")

        print("generating demographic codes...")
        rais_df = _add_demographic_id(rais_df)

        print("determining establishment sizes...")
        rais_df = _recode_est_size(rais_df)

        print("{0} minutes to read.".format((time.time() - s) / 60.0))

    return rais_df