def main(first_year_str, second_year_str, cols, output_path, edu=None, years=1, strcasts=""):
    """Compute growth between two tables and write the result as a bz2 TSV.

    Both inputs are resolved through ``get_file`` and read as tab-separated
    tables. ``do_growth`` produces the table that is written to
    ``<output_path>/<table name>.tsv.bz2``.

    Args:
        first_year_str: Location of the first table, resolved by ``get_file``.
        second_year_str: Location of the second table; also handed to
            ``parse_table_name`` to derive the output file name.
        cols: Comma-separated names of the columns to calculate growth for.
        output_path: Directory the compressed result is written to.
        edu: Forwarded unchanged to ``do_growth``.
        years: Forwarded unchanged to ``do_growth``.
        strcasts: Optional comma-separated column names to read as ``str``.
    """
    start = time.time()
    step = 0

    # Columns listed in ``strcasts`` are read as strings.
    converters = {name: str for name in strcasts.split(",")} if strcasts else {}

    step += 1
    print("")
    print('''STEP {0}: \nCalculate 1 year growth'''.format(step))

    orig_path = get_file(first_year_str)
    df1 = pd.read_csv(orig_path, sep="\t", converters=converters)
    new_path = get_file(second_year_str)
    df2 = pd.read_csv(new_path, sep="\t", converters=converters)

    col_names = cols.split(",")
    print("CALCULATING growth for the following columns: {0}".format(col_names))

    t_name = parse_table_name(second_year_str)
    df2 = do_growth(t_name, df2, df1, col_names, years, edu)
    print("GOT TABLE NAME OF  {0}".format(t_name))

    # The fallback name is applied only after do_growth has seen the parsed
    # value, exactly as before.
    if not t_name:
        t_name = "noname"
    new_file_path = os.path.abspath(
        os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))

    # Close the compressed stream deterministically so the final bz2 block is
    # flushed to disk; previously the handle was left to the garbage collector.
    with bz2.BZ2File(new_file_path, 'wb') as out_file:
        df2.to_csv(out_file, sep="\t", index=False, float_format="%.4f")

    print("--- %s minutes ---" % str((time.time() - start) / 60))
def updateMDICxIBGE():
    """Report rows of ``attrs_bra`` whose MDIC code differs from MDICxIBGE.csv.

    Reads ``<DATA_DIR>/secex/MDICxIBGE.csv`` (``;``-delimited) and, for every
    data row, looks up the IBGE code in ``attrs_bra``. When exactly one row
    matches and its ``id_mdic`` differs from the MDIC code in the file, the
    change is printed, plus a notice when ``secex_yb`` has rows for that
    ``bra_id``. Any other match count is printed as an error.

    This is a dry run: the UPDATE statement is built but its execution is
    commented out.
    """
    raw_file_path = os.path.abspath(os.path.join(DATA_DIR, 'secex', 'MDICxIBGE.csv'))
    raw_file = get_file(raw_file_path)
    csv_reader = csv.reader(raw_file, delimiter=";")

    for i, line in enumerate(csv_reader):
        # Skip the header and blank rows before indexing into the row, so a
        # short header or trailing empty line cannot raise IndexError.
        if i == 0 or not line:
            continue

        # Column 7 = IBGE code, column 9 = MDIC code.
        ibge_cod = line[7].strip()
        mdic_cod = line[9].strip()

        # NOTE(review): the statements below are assembled with str.format
        # from file contents. Switch to the driver's parameter binding once
        # the cursor's paramstyle is confirmed.
        sql = "select id_mdic, id from attrs_bra where id_ibge = {0}".format(ibge_cod)
        cursor.execute(sql)
        values = cursor.fetchall()
        size = len(values)

        if size != 1:
            print("Error finding a IBGE {0}: Found {1}".format(ibge_cod, size))
            continue

        old_mdic = str(values[0][0])
        old_idbra = str(values[0][1])
        if old_mdic != mdic_cod:
            print("Changing MDIC code from {0} to {1} in IBGE cod {2}".format(old_mdic, mdic_cod, ibge_cod))
            sql = "select * from secex_yb where bra_id='{0}'".format(old_idbra)
            cursor.execute(sql)
            if len(cursor.fetchall()) > 0:
                print("Exports found for {0}".format(ibge_cod))
            sql = "update attrs_bra set id_mdic = {0} where id_ibge = {1}".format(mdic_cod, ibge_cod)
            # Intentionally not executed (dry run).
            #cursor.execute(sql)
def to_df(input_file_path, separator=";", index=False):
    """Read a delimited text file into a DataFrame and print the load time.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        separator: Field delimiter handed to ``pandas.read_csv``.
        index: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The DataFrame parsed with ``,`` as the decimal mark and UTF-8 decoding.
    """
    input_file = get_file(input_file_path)
    start = time.time()
    df = pandas.read_csv(input_file, sep=separator, engine='c', decimal=',', encoding='utf-8')

    # Build the message first and print a single value: the old
    # ``print (...) / 60.0, "..."`` form only works as a Python 2 print
    # statement and raises TypeError when print is a function.
    elapsed_minutes = (time.time() - start) / 60.0
    print("{0} minutes to read.".format(elapsed_minutes))
    return df
def main(first_year_str, second_year_str, cols, output_path, edu=None, years=1, strcasts=""):
    """Compute growth between two tables and write the result as a bz2 TSV.

    Both inputs are resolved through ``get_file`` and read as tab-separated
    tables. ``do_growth`` produces the table that is written to
    ``<output_path>/<table name>.tsv.bz2``.

    Args:
        first_year_str: Location of the first table, resolved by ``get_file``.
        second_year_str: Location of the second table; also handed to
            ``parse_table_name`` to derive the output file name.
        cols: Comma-separated names of the columns to calculate growth for.
        output_path: Directory the compressed result is written to.
        edu: Forwarded unchanged to ``do_growth``.
        years: Forwarded unchanged to ``do_growth``.
        strcasts: Optional comma-separated column names to read as ``str``.
    """
    start = time.time()
    step = 0

    # Columns listed in ``strcasts`` are read as strings.
    converters = {name: str for name in strcasts.split(",")} if strcasts else {}

    step += 1
    print("")
    print('''STEP {0}: \nCalculate 1 year growth'''.format(step))

    orig_path = get_file(first_year_str)
    df1 = pd.read_csv(orig_path, sep="\t", converters=converters)
    new_path = get_file(second_year_str)
    df2 = pd.read_csv(new_path, sep="\t", converters=converters)

    col_names = cols.split(",")
    print("CALCULATING growth for the following columns: {0}".format(col_names))

    t_name = parse_table_name(second_year_str)
    df2 = do_growth(t_name, df2, df1, col_names, years, edu)
    print("GOT TABLE NAME OF  {0}".format(t_name))

    # The fallback name is applied only after do_growth has seen the parsed
    # value, exactly as before.
    if not t_name:
        t_name = "noname"
    new_file_path = os.path.abspath(
        os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))

    # Close the compressed stream deterministically so the final bz2 block is
    # flushed to disk; previously the handle was left to the garbage collector.
    with bz2.BZ2File(new_file_path, 'wb') as out_file:
        df2.to_csv(out_file, sep="\t", index=False, float_format="%.4f")

    print("--- %s minutes ---" % str((time.time() - start) / 60))
def to_df(input_file_path, separator=";", index=False):
    """Read a delimited text file into a DataFrame and print the load time.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        separator: Field delimiter handed to ``pandas.read_csv``.
        index: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        The DataFrame parsed with ``,`` as the decimal mark and UTF-8 decoding.
    """
    input_file = get_file(input_file_path)
    start = time.time()
    df = pandas.read_csv(input_file,
                         sep=separator,
                         engine='c',
                         decimal=',',
                         encoding='utf-8')

    # ``print(time.time() - s) / 60.0, "..."`` looked like a call whose result
    # was divided: it only works as a Python 2 print statement and raises
    # TypeError when print is a function. Print one pre-built string instead.
    elapsed_minutes = (time.time() - start) / 60.0
    print("{0} minutes to read.".format(elapsed_minutes))
    return df
def load_model(model_path=MODEL, weights_path=WEIGHTS):
    """Return the module-level model, loading it from disk on first use.

    The architecture is read as JSON from ``model_path``, the weights are
    obtained with ``helpers.get_file(weights_path)``, and the model is
    compiled before being cached in the global ``_model``.

    Args:
        model_path: Path of the JSON file describing the model.
        weights_path: Location of the weights, resolved by ``helpers.get_file``.

    Returns:
        The cached model, or ``None`` when loading failed with ``IOError``.
    """
    global _model
    print("load model")
    if _model is not None:
        return _model

    try:
        # The context manager closes the file even if read() fails.
        with open(model_path, 'r') as json_file:
            loaded_model_json = json_file.read()

        # Build into a local name first: the global cache must never hold a
        # model whose weights or compilation step did not complete.
        model = model_from_json(loaded_model_json)
        weights = helpers.get_file(weights_path)
        model.set_weights(weights)
        model.compile(optimizer="adam",
                      loss='categorical_crossentropy',
                      metrics=['top_k_categorical_accuracy'])
    except IOError as err:
        # Best-effort load: callers still get None, but the cause is reported
        # instead of being dropped silently.
        print("Could not load model: {0}".format(err))
        _model = None
        return _model

    _model = model
    print("Loaded model from disk")
    return _model
def to_df(input_file_path, index=False):
    """Read a data file into a pandas DataFrame and print how long it took.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        index: Falsy to parse a raw ``;``-delimited file. Otherwise an
            iterable of the one-letter keys of ``index_lookup``; the matching
            columns of a tab-separated file become the DataFrame index.

    Returns:
        The resulting DataFrame (``rais_df``).

    NOTE(review): the cleaning steps are laid out inside the ``else`` branch
    because they rely on the ``age`` column selected there -- confirm that
    this nesting matches the intended control flow.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        # Translate the one-letter codes into column names and index on them.
        index_lookup = {"y":"year", "b":"bra_id", "i":"cnae_id", "o":"cbo_id", "d": "d_id"}
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file, sep="\t", converters={"bra_id":str, "cbo_id":str, "cnae_id":str})
        rais_df = rais_df.set_index(index_cols)
    else:
        # orig_cols is never read below; it appears to record the raw header names.
        # NOTE(review): it lists 15 names while cols has 14 -- confirm the
        # column count of the raw file.
        orig_cols = ['BrazilianOcupation_ID', 'EconomicAtivity_ID_CNAE', 'Literacy', 'Age', 'Establishment_ID', 'Simple', 'Municipality_ID', 'Employee_ID', 'Color', 'WageReceived', 'AverageMonthlyWage', 'Gender', 'Establishment_Size', 'Year', 'Establishment_ID_len']
        # Names applied in place of the file's own header row (header=0).
        cols = ["cbo_id", "cnae_id", "literacy", "age", "est_id", "simple", "bra_id", "num_emp", "color", "wage_dec", "wage", "gender", "est_size", "year"]
        delim = ";"
        # NOTE(review): "emp_id" is not one of the names in cols (the list has
        # "num_emp"), so that converter looks stale -- confirm.
        coerce_cols = {"bra_id": bra_replace, "cnae_id":cnae_replace, "cbo_id":cbo_replace, \
                        "emp_id":str, "est_id": str, "age": convertint}
        rais_df = pd.read_csv(input_file, header=0, sep=delim, names=cols, converters=coerce_cols, engine='c', decimal=',')
        # Keep only the columns used downstream.
        rais_df = rais_df[["year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id", "age"]]

        print "first remove rows with empty ages, if any..."
        # -999 is treated as the "empty age" marker (presumably produced by
        # convertint -- verify).
        count = rais_df[ rais_df.age == -999 ].age.count()
        if count > 0:
            print "** REMOVED", count, "rows due to empty ages"
            rais_df = rais_df[ rais_df.age != -999 ]

        print "finding missing attrs..."
        # `missing` is defined outside this function; per the warning text it
        # holds, for each column, the IDs that were not found in the DB
        # (presumably filled in by the *_replace converters -- verify).
        for col, missings in missing.items():
            if not len(missings): continue
            num_rows = rais_df.shape[0]
            print; print "[WARNING]"; print "The following {0} IDs are not in the DB and will be dropped from the data.".format(col); print list(missings)
            # Rows are dropped where the column is NaN.
            rais_df = rais_df.dropna(subset=[col])
            print; print "{0} rows deleted.".format(num_rows - rais_df.shape[0]); print;

    # Python 2 print statement: prints the elapsed minutes followed by the text.
    print (time.time() - s) / 60.0, "minutes to read."
    return rais_df
def to_df(input_file_path, index=False):
    """Read a data file into a DataFrame, deriving ``d_id`` and ``est_size``.

    With a truthy ``index`` the file is read as a tab-separated table and
    indexed on the requested columns. Otherwise the raw ``;``-delimited file
    is parsed, rows with an empty age or a missing attribute are dropped, the
    gender/age/color/literacy columns are folded into a ``d_id`` code, and
    ``est_size`` is recoded.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        index: Falsy for a raw file; otherwise an iterable of the one-letter
            keys ``y``, ``b``, ``i``, ``o``, ``d`` naming the index columns.

    Returns:
        The resulting DataFrame.

    NOTE(review): the raw-file processing is placed inside the ``else``
    branch because it relies on columns that only that branch selects --
    confirm that this nesting matches the intended control flow.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {"y": "year", "b": "bra_id", "i": "cnae_id", "o": "cbo_id", "d": "d_id"}
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file, sep="\t",
                              converters={"bra_id": str, "cbo_id": str, "cnae_id": str})
        rais_df = rais_df.set_index(index_cols)
    else:
        # Names applied in place of the file's own header row (header=0).
        cols = ["cbo_id", "cnae_id", "literacy", "age", "est_id", "simple", "bra_id",
                "num_emp", "color", "wage_dec", "wage", "gender", "est_size", "year"]
        delim = ";"
        # NOTE(review): "emp_id" is not one of the names in cols (the list has
        # "num_emp"), so that converter looks stale -- confirm.
        coerce_cols = {"bra_id": bra_replace, "cnae_id": cnae_replace, "cbo_id": cbo_replace,
                       "emp_id": str, "est_id": str, "age": convertint}
        rais_df = pd.read_csv(input_file, header=0, sep=delim, names=cols,
                              converters=coerce_cols, engine='c', decimal=',')
        rais_df = rais_df[["year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp",
                           "est_id", "age", "color", "gender", "est_size", "literacy"]]

        print("first remove rows with empty ages, if any...")
        # -999 is treated as the "empty age" marker.
        count = rais_df[rais_df.age == -999].age.count()
        if count > 0:
            print("** REMOVED {0} rows due to empty ages".format(count))
            rais_df = rais_df[rais_df.age != -999]

        print("finding missing attrs...")
        # `missing` is defined outside this function; per the warning text it
        # holds, for each column, the IDs that were not found in the DB.
        for col, missings in missing.items():
            if not len(missings):
                continue
            num_rows = rais_df.shape[0]
            print("")
            print("[WARNING]")
            print("The following {0} IDs are not in the DB and will be dropped from the data.".format(col))
            print(list(missings))
            rais_df = rais_df.dropna(subset=[col])
            print("")
            print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
            print("")

        print("generating demographic codes...")
        FEMALE, MALE = 0, 1
        gender_dict = {MALE: 'A', FEMALE: 'B'}
        rais_df["gender"] = rais_df["gender"].replace(gender_dict)

        INDIAN, WHITE, BLACK, ASIAN, MULTI, UNKNOWN = 1, 2, 4, 6, 8, 9
        color_dict = {INDIAN: 'C', WHITE: 'D', BLACK: 'E', ASIAN: 'F', MULTI: 'G',
                      UNKNOWN: 'H', -1: 'H'}
        rais_df["color"] = rais_df["color"].replace(color_dict)

        lit_dict = {1: 'I', 2: 'I', 3: 'J', 4: 'J', 5: 'J', 6: 'J', 7: 'K', 8: 'K',
                    9: 'L', -1: 'M'}
        rais_df["literacy"] = rais_df["literacy"].replace(lit_dict)

        # Age decade, clamped to the range 1..6.
        rais_df["age_demo"] = (rais_df["age"] / 10).astype(int)
        rais_df["age_demo"] = rais_df["age_demo"].clip(1, 6)

        # d_id = gender letter + age decade + color letter + literacy letter.
        rais_df["d_id"] = rais_df['gender'].str.cat([
            rais_df['age_demo'].values.astype(str),
            rais_df['color'].values.astype(str),
            rais_df['literacy'].values.astype(str),
        ])
        rais_df = rais_df.drop(["gender", "color", "age_demo", "literacy"], axis=1)

        print("determining establishment sizes...")
        # Flag each row by characters 1-2 of cnae_id: -1 when that number is
        # between 5 and 35 (inclusive), 0 otherwise.
        rais_df["new_est_size_1"] = rais_df["cnae_id"].str.slice(1, 3).astype(int)
        rais_df.loc[rais_df["new_est_size_1"].between(5, 35), "new_est_size_1"] = -1
        rais_df.loc[rais_df["new_est_size_1"] >= 0, "new_est_size_1"] = 0

        # Split the rows into two complementary columns (NaN marks "belongs to
        # the other one"). These two assignments used to end in ``.head()``,
        # which kept only the first five rows; index alignment then set every
        # other row to NaN and its final est_size to 0.
        rais_df["new_est_size_2"] = rais_df["new_est_size_1"].mask(rais_df["new_est_size_1"] >= 0)
        rais_df["new_est_size_1"] = rais_df["new_est_size_1"].where(rais_df["new_est_size_1"] >= 0)
        rais_df.loc[rais_df["new_est_size_2"] == -1, "new_est_size_2"] = 0

        # Adding est_size to the 0/NaN flags copies the raw size into exactly
        # one of the two columns per row.
        rais_df["new_est_size_2"] = rais_df["new_est_size_2"] + rais_df["est_size"]
        rais_df["new_est_size_1"] = rais_df["new_est_size_1"] + rais_df["est_size"]

        # Each group is recoded to 0..3 with its own thresholds.
        est_size_1_lookup = {1: 0, 2: 0, 3: 1, 4: 1, 5: 2, 6: 3, 7: 3, 8: 3, 9: 3}
        est_size_2_lookup = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 2, 7: 2, 8: 3, 9: 3}
        rais_df["new_est_size_1"] = rais_df["new_est_size_1"].replace(est_size_1_lookup)
        rais_df["new_est_size_2"] = rais_df["new_est_size_2"].replace(est_size_2_lookup)

        # Exactly one of the two columns is non-NaN per row, so the sum picks it.
        rais_df["est_size"] = rais_df["new_est_size_1"].fillna(0) + rais_df["new_est_size_2"].fillna(0)
        rais_df = rais_df.drop(["new_est_size_1", "new_est_size_2"], axis=1)

    print("{0} minutes to read.".format((time.time() - s) / 60.0))
    return rais_df
def to_df(input_file_path, index=False):
    """Read a data file into a pandas DataFrame and print how long it took.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        index: Falsy to parse a raw ``;``-delimited file. Otherwise an
            iterable of the one-letter keys of ``index_lookup``; the matching
            columns of a tab-separated file become the DataFrame index.

    Returns:
        The resulting DataFrame (``rais_df``).

    NOTE(review): the cleaning steps are laid out inside the ``else`` branch
    because they rely on the ``age`` column selected there -- confirm that
    this nesting matches the intended control flow.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        # Translate the one-letter codes into column names and index on them.
        index_lookup = {
            "y": "year",
            "b": "bra_id",
            "i": "cnae_id",
            "o": "cbo_id",
            "d": "d_id"
        }
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file,
                              sep="\t",
                              converters={
                                  "bra_id": str,
                                  "cbo_id": str,
                                  "cnae_id": str
                              })
        rais_df = rais_df.set_index(index_cols)
    else:
        # orig_cols is never read below; it appears to record the raw header names.
        # NOTE(review): it lists 15 names while cols has 14 -- confirm the
        # column count of the raw file.
        orig_cols = [
            'BrazilianOcupation_ID', 'EconomicAtivity_ID_CNAE', 'Literacy',
            'Age', 'Establishment_ID', 'Simple', 'Municipality_ID',
            'Employee_ID', 'Color', 'WageReceived', 'AverageMonthlyWage',
            'Gender', 'Establishment_Size', 'Year', 'Establishment_ID_len'
        ]
        # Names applied in place of the file's own header row (header=0).
        cols = [
            "cbo_id", "cnae_id", "literacy", "age", "est_id", "simple",
            "bra_id", "num_emp", "color", "wage_dec", "wage", "gender",
            "est_size", "year"
        ]
        delim = ";"
        # NOTE(review): "emp_id" is not one of the names in cols (the list has
        # "num_emp"), so that converter looks stale -- confirm.
        coerce_cols = {"bra_id": bra_replace, "cnae_id":cnae_replace, "cbo_id":cbo_replace, \
                        "emp_id":str, "est_id": str, "age": convertint}
        rais_df = pd.read_csv(input_file,
                              header=0,
                              sep=delim,
                              names=cols,
                              converters=coerce_cols,
                              engine='c',
                              decimal=',')
        # Keep only the columns used downstream.
        rais_df = rais_df[[
            "year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id",
            "age"
        ]]

        print "first remove rows with empty ages, if any..."
        # -999 is treated as the "empty age" marker (presumably produced by
        # convertint -- verify).
        count = rais_df[rais_df.age == -999].age.count()
        if count > 0:
            print "** REMOVED", count, "rows due to empty ages"
            rais_df = rais_df[rais_df.age != -999]

        print "finding missing attrs..."
        # `missing` is defined outside this function; per the warning text it
        # holds, for each column, the IDs that were not found in the DB
        # (presumably filled in by the *_replace converters -- verify).
        for col, missings in missing.items():
            if not len(missings):
                continue
            num_rows = rais_df.shape[0]
            print
            print "[WARNING]"
            print "The following {0} IDs are not in the DB and will be dropped from the data.".format(
                col)
            print list(missings)
            # Rows are dropped where the column is NaN.
            rais_df = rais_df.dropna(subset=[col])
            print
            print "{0} rows deleted.".format(num_rows - rais_df.shape[0])
            print

    # Python 2 print statement (not a call): prints the elapsed minutes
    # followed by the text.
    print(time.time() - s) / 60.0, "minutes to read."
    return rais_df
def to_df(input_file_path, index=False):
    """Read a data file into a DataFrame, deriving ``d_id`` and ``est_size``.

    With a truthy ``index`` the file is read as a tab-separated table and
    indexed on the requested columns. Otherwise the raw ``;``-delimited file
    is parsed, rows with an empty age or a missing attribute are dropped, the
    gender/age/color/literacy columns are folded into a ``d_id`` code, and
    ``est_size`` is recoded.

    Args:
        input_file_path: Location of the input, resolved through ``get_file``.
        index: Falsy for a raw file; otherwise an iterable of the one-letter
            keys ``y``, ``b``, ``i``, ``o``, ``d`` naming the index columns.

    Returns:
        The resulting DataFrame.

    NOTE(review): the raw-file processing is placed inside the ``else``
    branch because it relies on columns that only that branch selects --
    confirm that this nesting matches the intended control flow.
    """
    input_file = get_file(input_file_path)
    s = time.time()

    if index:
        index_lookup = {
            "y": "year",
            "b": "bra_id",
            "i": "cnae_id",
            "o": "cbo_id",
            "d": "d_id",
        }
        index_cols = [index_lookup[i] for i in index]
        rais_df = pd.read_csv(input_file,
                              sep="\t",
                              converters={
                                  "bra_id": str,
                                  "cbo_id": str,
                                  "cnae_id": str,
                              })
        rais_df = rais_df.set_index(index_cols)
    else:
        # Names applied in place of the file's own header row (header=0).
        cols = [
            "cbo_id", "cnae_id", "literacy", "age", "est_id", "simple",
            "bra_id", "num_emp", "color", "wage_dec", "wage", "gender",
            "est_size", "year"
        ]
        delim = ";"
        # NOTE(review): "emp_id" is not one of the names in cols (the list has
        # "num_emp"), so that converter looks stale -- confirm.
        coerce_cols = {
            "bra_id": bra_replace,
            "cnae_id": cnae_replace,
            "cbo_id": cbo_replace,
            "emp_id": str,
            "est_id": str,
            "age": convertint,
        }
        rais_df = pd.read_csv(input_file,
                              header=0,
                              sep=delim,
                              names=cols,
                              converters=coerce_cols,
                              engine='c',
                              decimal=',')
        rais_df = rais_df[[
            "year", "bra_id", "cnae_id", "cbo_id", "wage", "num_emp", "est_id",
            "age", "color", "gender", "est_size", "literacy"
        ]]

        print("first remove rows with empty ages, if any...")
        # -999 is treated as the "empty age" marker.
        count = rais_df[rais_df.age == -999].age.count()
        if count > 0:
            print("** REMOVED {0} rows due to empty ages".format(count))
            rais_df = rais_df[rais_df.age != -999]

        print("finding missing attrs...")
        # `missing` is defined outside this function; per the warning text it
        # holds, for each column, the IDs that were not found in the DB.
        for col, missings in missing.items():
            if not len(missings):
                continue
            num_rows = rais_df.shape[0]
            print("")
            print("[WARNING]")
            print("The following {0} IDs are not in the DB and will be dropped from the data.".format(col))
            print(list(missings))
            rais_df = rais_df.dropna(subset=[col])
            print("")
            print("{0} rows deleted.".format(num_rows - rais_df.shape[0]))
            print("")

        print("generating demographic codes...")
        FEMALE, MALE = 0, 1
        gender_dict = {MALE: 'A', FEMALE: 'B'}
        rais_df["gender"] = rais_df["gender"].replace(gender_dict)

        INDIAN, WHITE, BLACK, ASIAN, MULTI, UNKNOWN = 1, 2, 4, 6, 8, 9
        color_dict = {
            INDIAN: 'C',
            WHITE: 'D',
            BLACK: 'E',
            ASIAN: 'F',
            MULTI: 'G',
            UNKNOWN: 'H',
            -1: 'H',
        }
        rais_df["color"] = rais_df["color"].replace(color_dict)

        lit_dict = {
            1: 'I',
            2: 'I',
            3: 'J',
            4: 'J',
            5: 'J',
            6: 'J',
            7: 'K',
            8: 'K',
            9: 'L',
            -1: 'M',
        }
        rais_df["literacy"] = rais_df["literacy"].replace(lit_dict)

        # Age decade, clamped to the range 1..6.
        rais_df["age_demo"] = (rais_df["age"] / 10).astype(int)
        rais_df["age_demo"] = rais_df["age_demo"].clip(1, 6)

        # d_id = gender letter + age decade + color letter + literacy letter.
        rais_df["d_id"] = rais_df['gender'].str.cat([
            rais_df['age_demo'].values.astype(str),
            rais_df['color'].values.astype(str),
            rais_df['literacy'].values.astype(str),
        ])
        rais_df = rais_df.drop(["gender", "color", "age_demo", "literacy"],
                               axis=1)

        print("determining establishment sizes...")
        # Flag each row by characters 1-2 of cnae_id: -1 when that number is
        # between 5 and 35 (inclusive), 0 otherwise.
        rais_df["new_est_size_1"] = rais_df["cnae_id"].str.slice(
            1, 3).astype(int)
        rais_df.loc[rais_df["new_est_size_1"].between(5, 35),
                    "new_est_size_1"] = -1
        rais_df.loc[rais_df["new_est_size_1"] >= 0, "new_est_size_1"] = 0

        # Split the rows into two complementary columns (NaN marks "belongs to
        # the other one"). These two assignments used to end in ``.head()``,
        # which kept only the first five rows; index alignment then set every
        # other row to NaN and its final est_size to 0.
        rais_df["new_est_size_2"] = rais_df["new_est_size_1"].mask(
            rais_df["new_est_size_1"] >= 0)
        rais_df["new_est_size_1"] = rais_df["new_est_size_1"].where(
            rais_df["new_est_size_1"] >= 0)
        rais_df.loc[rais_df["new_est_size_2"] == -1, "new_est_size_2"] = 0

        # Adding est_size to the 0/NaN flags copies the raw size into exactly
        # one of the two columns per row.
        rais_df["new_est_size_2"] = (rais_df["new_est_size_2"] +
                                     rais_df["est_size"])
        rais_df["new_est_size_1"] = (rais_df["new_est_size_1"] +
                                     rais_df["est_size"])

        # Each group is recoded to 0..3 with its own thresholds.
        est_size_1_lookup = {
            1: 0,
            2: 0,
            3: 1,
            4: 1,
            5: 2,
            6: 3,
            7: 3,
            8: 3,
            9: 3,
        }
        est_size_2_lookup = {
            1: 0,
            2: 0,
            3: 0,
            4: 1,
            5: 1,
            6: 2,
            7: 2,
            8: 3,
            9: 3,
        }
        rais_df["new_est_size_1"] = rais_df["new_est_size_1"].replace(
            est_size_1_lookup)
        rais_df["new_est_size_2"] = rais_df["new_est_size_2"].replace(
            est_size_2_lookup)

        # Exactly one of the two columns is non-NaN per row, so the sum picks it.
        rais_df["est_size"] = (rais_df["new_est_size_1"].fillna(0) +
                               rais_df["new_est_size_2"].fillna(0))
        rais_df = rais_df.drop(["new_est_size_1", "new_est_size_2"], axis=1)

    print("{0} minutes to read.".format((time.time() - s) / 60.0))
    return rais_df