def acquire_career_info(df):
    """Clean the career-info table (list_of_dfs[0]) and persist it as CSV.

    - Converts 'dem_full_time_job' yes/no answers to booleans.
    - Replaces nulls in 'dem_education_level' with 'unknown'.
    - Expands 'dem_education_level' into one boolean column per level,
      then drops the now-redundant source column.

    Returns the cleaned DataFrame, or None if cleaning failed.
    """
    # Variables for this df
    initial_cols = ['dem_education_level']
    final_cols = ['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed', 'Unknown_Ed']
    try:
        print('· Cleaning df_career_info ....')
        # Binarizing data
        df['dem_full_time_job'] = m_ac.yes_no_to_bool(df['dem_full_time_job'])
        # Changing nulls by unknown (qualitative params [low, medium, high, unknown])
        df['dem_education_level'] = m_ac.null_to_unknown(
            df['dem_education_level'])
        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)
        # Dropping duplicate cols (reuse initial_cols instead of a second list)
        df.drop(columns=initial_cols, inplace=True)
        # Save table into local folder
        m_ac.save_df_to_csv(df, path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[0])
        return df
    except Exception as err:
        # Was a bare `except:` naming the wrong function; keep the
        # best-effort behavior but report which function and why.
        print(f'Something went wrong with [acquire_career_info]: {err}')
    finally:
        # Memory Usage and objects manually from Jupyter file
        print('\n\t\t\t >> Done cleaning df_career_info!.'
              '\n\t\t\t\t >> Checkout /data/processed/\n')
def acquire_country_info(df):
    """Clean the country-info table (list_of_dfs[1]) and persist it as CSV.

    Homogenizes the free-text 'rural' column to two categories, expands it
    into boolean 'rural_context'/'urban_context' columns, drops the source
    column, and saves the result.

    Returns the cleaned DataFrame, or None if cleaning failed.
    """
    # Variables for this df
    initial_cols = ['rural']
    final_cols = ['rural_context', 'urban_context']
    try:
        print('· Cleaning df_country_info ....')
        # String operations: multiple inputs to binomial cols -> only 2 values from 2 options
        df['rural'] = m_ac.context_homogenization(df['rural'])
        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)
        df.drop(columns='rural', inplace=True)
        # Save table into local folder
        m_ac.save_df_to_csv(df, path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[1])
        return df
    except Exception as err:
        # Was a bare `except:` blaming [acquire_career_info] (copy-paste bug)
        print(f'Something went wrong with [acquire_country_info]: {err}')
    finally:
        print('\t\t\t >> Done cleaning df_country_info!.'
              '\n\t\t\t\t >> Checkout /data/processed/\n')
def add_country_col(df_to_change, countries_dict):
    """Add a human-readable country-name column to the country-info table.

    Maps 'country_code' through countries_dict (web-scraped data) into a
    new 'country_names' column, rewrites data/processed/country_info.csv
    and returns the modified DataFrame (None on failure).
    """
    # Variables
    path = 'data/processed/'
    name = 'country_info'
    col_name_to_add = 'country_names'
    col_name_reference = 'country_code'
    print(f'\n\n· Adding column to csv located at {path}....')
    try:
        # Action to apply: map each country code to its full name
        # (`dict=` is the callee's parameter name, kept as-is)
        df_to_change[col_name_to_add] = countryCode_to_countryName(
            serie=df_to_change[col_name_reference], dict=countries_dict)
        # Save table into local folder
        print('\t ··· ready to rewrite..')
        m_ac.save_df_to_csv(df_to_change,
                            path=path,  # Function adds hierarchy of files
                            name=name)  # Name of csv
        return df_to_change
    except Exception as err:
        # Was a bare `except:` naming [add_col_to_csv]; report the real site
        print(f'Something went wrong at [add_country_col]: {err}')
    finally:
        print('\t\t\t >> Done adding web scrapping information!.'
              '\n\t\t\t\t >> Checkout /data/processed/')
def add_jobs_column(df_to_change):
    """Add normalized job names to the career-info table via the jobs API.

    Collects the unique 'normalized_job_code' uuids, resolves them to
    names through threaded API calls, writes the enriched DataFrame back
    to data/processed/career_info.csv and returns it (None on failure).
    """
    # Variables
    path = 'data/processed/'
    csv_name = 'career_info'
    col_name_to_add = 'normalized_job_names'
    col_name_reference = 'normalized_job_code'
    print(f'\n\n· Adding column to csv located at {path}....')
    try:
        uuid_db = df_to_change[col_name_reference].unique().tolist()
        print(' ·· Threading API response to get dicts...')
        lst_dicts = threads_runner_for_API(uuid_db)
        df_changed = change_temp_df(list_of_dict_API=lst_dicts,
                                    df_to_change=df_to_change,
                                    col_name_reference=col_name_reference,
                                    col_name_to_add=col_name_to_add)
        print(f' ·· Adding new col to {csv_name}.csv ....')
        m_ac.save_df_to_csv(df_changed,
                            path=path,  # Function adds hierarchy of files
                            name=csv_name)  # Name of csv
        return df_changed
    except Exception as err:
        # Was a bare `except:` naming [add_col_to_csv]; report the real site
        print(f'Something went wrong at [add_jobs_column]: {err}')
    finally:
        print('\t\t\t >> Done adding web scrapping information!.'
              '\n\t\t\t\t >> Checkout /data/processed/')
def get_separate_df(serie_to_eval, separator_string, file_name):
    """Split a multiple-choice column into its own DataFrame.

    Expands `serie_to_eval` on `separator_string` via
    m_ac.multiple_choice_col_to_df, saves the resulting DataFrame as a CSV
    named `file_name` under PATH_TO_SAVE_CSV, and returns it.
    """
    print('\t ···· Iterating through poll lists')
    separated = m_ac.multiple_choice_col_to_df(
        serie=serie_to_eval, separator=separator_string)
    m_ac.save_df_to_csv(separated, path=PATH_TO_SAVE_CSV, name=file_name)
    return separated
def acquire_personal_info(df):
    """Clean the personal-info table (list_of_dfs[2]) and persist it as CSV.

    Normalizes ages (string -> number, birth year -> age), homogenizes
    gender labels, converts 'dem_has_children' to booleans, and expands
    'age_group' into one boolean column per age group.

    Returns the cleaned DataFrame, or None if cleaning failed.
    """
    # Variables in df
    initial_cols = ['age_group']
    final_cols = [
        'ageGroup_14_25', 'ageGroup_26_39', 'ageGroup_40_65',
        'ageGroup_juvenile'
    ]
    try:
        print('· Cleaning df_personal_info ....')
        # Number normalization
        df['age'] = m_ac.ageStr_to_ageNum(serie=df['age'])
        df['age'] = m_ac.year_to_age(df['age'])
        # String operations: multiple inputs in binomial cols -> only 2 values for 2 options
        df['gender'] = m_ac.gender_homogenization(df['gender'])
        df['dem_has_children'] = m_ac.yes_no_to_bool(df['dem_has_children'])
        # Separate cols for boolean options
        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)
        # Save table into local folder
        m_ac.save_df_to_csv(df, path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[2])
        return df
    except Exception as err:
        # Was a bare `except:` naming [acquire_table_personal_info]
        print(f'Something went wrong with [acquire_personal_info]: {err}'
              )  # TODO: write to a log file instead of stdout
    finally:
        print('\t\t\t >> Done cleaning df_personal_info!.'
              '\n\t\t\t\t >> Checkout /data/processed/\n')
def get_percentages_gender_by_job(base_analysis_df):
    """Compute job/gender participation percentages per country.

    INPUT  -> base_analysis_df: all cleaned dfs joined by uuid (useful cols).
    OUTPUT -> DataFrame (also saved to data/results) with, for each
              (country, job, gender) group, the respondent count
              ('quantity') and that count as a percentage of the country's
              number of distinct jobs ('percentage', rounded to 3 decimals).
    """
    # Variables.
    filtr = ['country_names', 'normalized_job_names', 'gender']
    drop_cols = ['uuid', 'dem_full_time_job', 'High_Ed', 'Low_Ed',
                 'Medium_Ed', 'No_Ed', 'totals_per_country']
    new_cols = ['quantity', 'percentage']

    # Add first col == quantity: every non-grouping column counts rows, so
    # drop all but the helper 'quantity' before the groupby-count.
    df_job_gender = base_analysis_df.assign(quantity=1) \
        .drop(columns=drop_cols[:-1]) \
        .groupby(filtr) \
        .agg('count') \
        .reset_index()

    # Generate totals_per_country: number of distinct jobs per country
    df_total_per_country = df_job_gender.groupby(filtr[0])[filtr[1]] \
        .nunique() \
        .to_frame() \
        .rename(columns={filtr[1]: drop_cols[-1]})
    df_job_gender = df_job_gender.merge(df_total_per_country, on=filtr[0])

    # Add second col == percentage; delete totals_per_country once consumed
    df_job_gender[new_cols[1]] = round(
        df_job_gender[new_cols[0]] / df_job_gender[drop_cols[-1]] * 100, 3)
    df_job_gender.drop(columns=[drop_cols[-1]], inplace=True)

    # Save table into local folder
    m_ac.save_df_to_csv(df_job_gender,
                        path='data/results',  # Function adds hierarchy of files
                        name='df_percentage_by_job_and_gender')  # Name of csv
    return df_job_gender
def get_df_top_skills(country_argument, num_top_skills, list_dfs_cleaned,
                      gender="All"):
    """Build the top-skills table per education level for a country.

    For each education level (High, Medium, Low) the top `num_top_skills`
    skills and their counters are fetched via top_skills_by_ed_level over
    the base analysis DataFrame for `country_argument`, optionally
    filtered by `gender`.

    Returns a tuple (df_to_save, result_counts, gender):
      - df_to_save: one row per education level, rank columns '#0', '#1',
        ... (also saved to data/results/df_top_skills.csv).
      - result_counts: DataFrame of the matching counters.
      - gender: the gender filter that was applied, echoed back.
    """
    # Variables
    ed_levels = ['High_Ed', 'Medium_Ed', 'Low_Ed']
    path_to_save = 'data/results'

    # Tuples of top skills jobs and counters — one (skills, counts) pair
    # per education level (previously three copy-pasted call sites).
    all_series = []
    all_counts = []
    for level in ed_levels:
        serie, counts = top_skills_by_ed_level(
            base_analysis_df=get_base_analysis_df(
                country_argument=country_argument,
                list_of_clean_df=list_dfs_cleaned),
            number_skills=num_top_skills,
            ed_level=level,
            gender_to_eval=gender)
        all_series.append(serie)
        all_counts.append(counts)

    # Rank-column labels '#0'..'#k-1' (was hard-coded to range(5); now
    # follows num_top_skills so >5 skills are labeled correctly too)
    cols = {n: f'#{n}' for n in range(num_top_skills)}

    # Construction of DF of jobs
    result_dfs = pd.concat(all_series, axis=1, sort=False)
    df_to_save = result_dfs.T.rename(columns=cols)

    # Construction of DF of counts
    result_counts = pd.DataFrame(all_counts)

    # Save DataFrame to csv in data/results
    m_ac.save_df_to_csv(df_to_save,
                        path=path_to_save,  # Function adds hierarchy of files
                        name='df_top_skills')  # Name of csv
    return df_to_save, result_counts, gender