예제 #1
0
def acquire_career_info(df):
    """Clean the career-info table, save it as CSV and return it.

    Career info is ``list_of_dfs[0]``.

    Steps performed:
      * ``dem_full_time_job`` is replaced by ``m_ac.yes_no_to_bool`` of itself.
      * ``dem_education_level`` is replaced by ``m_ac.null_to_unknown`` of
        itself.
      * ``m_ac.separate_df_to_bools`` is called with ``initial_cols`` and
        ``final_cols`` (presumably the source column and the names of the
        boolean columns it generates -- confirm against the helper) and its
        result is left-joined onto the table.
      * ``dem_education_level`` is dropped and the table is written with
        ``m_ac.save_df_to_csv`` under ``PATH_TO_SAVE_CSV`` using the name
        ``FILE_NAMES_FOR_CSV[0]``.

    Args:
        df: Career-info DataFrame. The two column assignments modify it in
            place; the joined result is a new object.

    Returns:
        The cleaned DataFrame, or ``None`` if any step raised an exception
        (the failure is reported on stdout instead of being propagated).
    """
    # Variables for this df
    initial_cols = ['dem_education_level']
    final_cols = ['High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed', 'Unknown_Ed']

    try:
        print('· Cleaning df_career_info ....')
        # Binarizing data
        df['dem_full_time_job'] = m_ac.yes_no_to_bool(df['dem_full_time_job'])

        # Changing nulls by unknown (qualitative params [low, medium, high, unknown])
        df['dem_education_level'] = m_ac.null_to_unknown(
            df['dem_education_level'])

        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)

        # The source column is redundant once the boolean columns are joined
        df = df.drop(columns=initial_cols)

        # Save table into local folder
        m_ac.save_df_to_csv(df,
                            path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[0])
    except Exception as err:
        # Best-effort contract kept: report and return None. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, and the message now shows the real cause.
        print(f'Something went wrong with [acquire_career_info]: {err!r}')
        return None
    else:
        # Announce completion only when the table really was cleaned and
        # saved; a ``finally`` clause would also print this after a failure.
        print('\n\t\t\t  >> Done cleaning df_career_info!. \n'
              '        \t\t\t\t  >> Chekout /data/processed/\n')
        return df
예제 #2
0
def acquire_country_info(df):
    """Clean the country-info table, save it as CSV and return it.

    Country info is ``list_of_dfs[1]``.

    Steps performed:
      * ``rural`` is replaced by ``m_ac.context_homogenization`` of itself.
      * ``m_ac.separate_df_to_bools`` is called with ``initial_cols`` and
        ``final_cols`` (presumably the source column and the names of the
        boolean columns it generates -- confirm against the helper) and its
        result is left-joined onto the table.
      * ``rural`` is dropped and the table is written with
        ``m_ac.save_df_to_csv`` under ``PATH_TO_SAVE_CSV`` using the name
        ``FILE_NAMES_FOR_CSV[1]``.

    Args:
        df: Country-info DataFrame. The ``rural`` assignment modifies it in
            place; the joined result is a new object.

    Returns:
        The cleaned DataFrame, or ``None`` if any step raised an exception
        (the failure is reported on stdout instead of being propagated).
    """
    # Variables for this df
    initial_cols = ['rural']
    final_cols = ['rural_context', 'urban_context']

    try:
        print('· Cleaning df_country_info ....')
        # String Operations multiple inputs to binomial cols -> only 2 values from 2 options
        df['rural'] = m_ac.context_homogenization(df['rural'])

        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)
        df = df.drop(columns=initial_cols)

        # Save table into local folder
        m_ac.save_df_to_csv(df,
                            path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[1])
    except Exception as err:
        # Best-effort contract kept: report and return None. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate. The message names this function; it used to
        # blame acquire_career_info.
        print(f'Something went wrong with [acquire_country_info]: {err!r}')
        return None
    else:
        # Announce completion only when the table really was cleaned and
        # saved; a ``finally`` clause would also print this after a failure.
        print('\t\t\t  >> Done cleaning df_country_info!.\n'
              '        \t\t\t\t  >> Chekout /data/processed/\n')
        return df
예제 #3
0
def add_country_col(df_to_change, countries_dict):
    """Add a ``country_names`` column to the table, save it and return it.

    The new column is the result of ``countryCode_to_countryName`` applied to
    the ``country_code`` column with ``countries_dict``. The table is then
    written with ``m_ac.save_df_to_csv`` using path ``data/processed/`` and
    name ``country_info``.

    Args:
        df_to_change: DataFrame holding a ``country_code`` column. It is
            modified in place (the new column is assigned onto it).
        countries_dict: Lookup forwarded to ``countryCode_to_countryName``
            (presumably code -> name; confirm against that helper).

    Returns:
        ``df_to_change`` with the extra column, or ``None`` if any step
        raised an exception (the failure is reported on stdout instead of
        being propagated).
    """
    # Variables
    path = 'data/processed/'
    name = 'country_info'
    col_name_to_add = 'country_names'
    col_name_reference = 'country_code'

    print(f'\n\n· Adding column to csv located at {path}....')
    try:
        # Action to apply
        df_to_change[col_name_to_add] = countryCode_to_countryName(
            serie=df_to_change[col_name_reference], dict=countries_dict)

        # Save table into local folder
        print('\t ··· ready to rewrite..')
        m_ac.save_df_to_csv(
            df_to_change,
            path=path,  # Function adds hierarchy of files
            name=name)  # Name of csv
    except Exception as err:
        # Best-effort contract kept: report and return None. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, and the message names this function.
        print(f'Something went wrong at [add_country_col]: {err!r}')
        return None
    else:
        # Announce completion only on success; a ``finally`` clause would
        # also print this after a failure.
        print('\t\t\t  >> Done adding web scrapping information!.\n'
              '        \t\t\t\t   >> Chekout /data/processed/')
        return df_to_change
예제 #4
0
def add_jobs_column(df_to_change):
    """Add a ``normalized_job_names`` column to the table, save and return it.

    The distinct values of ``normalized_job_code`` are sent to
    ``threads_runner_for_API``; its result is handed to ``change_temp_df``
    together with the table and the reference/new column names. The frame
    returned by ``change_temp_df`` is written with ``m_ac.save_df_to_csv``
    using path ``data/processed/`` and name ``career_info``.

    Args:
        df_to_change: DataFrame holding a ``normalized_job_code`` column.

    Returns:
        The frame returned by ``change_temp_df``, or ``None`` if any step
        raised an exception (the failure is reported on stdout instead of
        being propagated).
    """
    # Variables
    path = 'data/processed/'
    csv_name = 'career_info'
    col_name_to_add = 'normalized_job_names'
    col_name_reference = 'normalized_job_code'

    print(f'\n\n· Adding column to csv located at {path}....')
    try:
        # Query each distinct code only once
        uuid_db = df_to_change[col_name_reference].unique().tolist()

        print(' ·· Threading API response to get dicts...')
        lst_dicts = threads_runner_for_API(uuid_db)
        df_changed = change_temp_df(list_of_dict_API=lst_dicts,
                                    df_to_change=df_to_change,
                                    col_name_reference=col_name_reference,
                                    col_name_to_add=col_name_to_add)

        print(f' ·· Adding new col to {csv_name}.csv ....')
        m_ac.save_df_to_csv(
            df_changed,
            path=path,  # Function adds hierarchy of files
            name=csv_name)  # Name of csv
    except Exception as err:
        # Best-effort contract kept: report and return None. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, and the message names this function.
        print(f'Something went wrong at [add_jobs_column]: {err!r}')
        return None
    else:
        # Announce completion only on success; a ``finally`` clause would
        # also print this after a failure.
        print('\t\t\t  >> Done adding web scrapping information!.\n'
              '            \t\t\t\t   >> Checkout /data/processed/')
        return df_changed
예제 #5
0
def get_separate_df(serie_to_eval, separator_string, file_name):
    """
    Build a table from one multiple-choice column and store it as CSV.

    INPUT   -> serie_to_eval: column forwarded to m_ac.multiple_choice_col_to_df
               separator_string: separator forwarded to the same helper
               file_name: name handed to m_ac.save_df_to_csv
    OUTPUT  -> the object returned by the helper, after it has been saved
               under PATH_TO_SAVE_CSV
    """
    print('\t ···· Iterating through poll lists')

    poll_table = m_ac.multiple_choice_col_to_df(
        serie=serie_to_eval,
        separator=separator_string,
    )
    m_ac.save_df_to_csv(
        poll_table,
        path=PATH_TO_SAVE_CSV,
        name=file_name,
    )
    return poll_table
예제 #6
0
def acquire_personal_info(df):
    """Clean the personal-info table, save it as CSV and return it.

    Personal info is ``list_of_dfs[2]``.

    Steps performed:
      * ``age`` is passed through ``m_ac.ageStr_to_ageNum`` and then
        ``m_ac.year_to_age``.
      * ``gender`` is replaced by ``m_ac.gender_homogenization`` of itself and
        ``dem_has_children`` by ``m_ac.yes_no_to_bool`` of itself.
      * ``m_ac.separate_df_to_bools`` is called with ``initial_cols`` and
        ``final_cols`` (presumably the source column and the names of the
        boolean columns it generates -- confirm against the helper) and its
        result is left-joined onto the table. ``age_group`` itself is kept.
      * The table is written with ``m_ac.save_df_to_csv`` under
        ``PATH_TO_SAVE_CSV`` using the name ``FILE_NAMES_FOR_CSV[2]``.

    Args:
        df: Personal-info DataFrame. The column assignments modify it in
            place; the joined result is a new object.

    Returns:
        The cleaned DataFrame, or ``None`` if any step raised an exception
        (the failure is reported on stdout instead of being propagated).
    """
    # Variables in df
    initial_cols = ['age_group']
    final_cols = [
        'ageGroup_14_25', 'ageGroup_26_39', 'ageGroup_40_65',
        'ageGroup_juvenile'
    ]

    try:
        print('· Cleaning df_personal_info ....')
        # Number normalization
        df['age'] = m_ac.ageStr_to_ageNum(serie=df['age'])
        df['age'] = m_ac.year_to_age(df['age'])

        # String Operations: multiple inputs in binomial cols -> only 2 values for 2 options
        df['gender'] = m_ac.gender_homogenization(df['gender'])
        df['dem_has_children'] = m_ac.yes_no_to_bool(df['dem_has_children'])

        # Separate cols for boolean options
        new_bool_df = m_ac.separate_df_to_bools(df, initial_cols, final_cols)
        df = df.join(other=new_bool_df, on=None, how='left', sort=False)

        # Save table into local folder
        m_ac.save_df_to_csv(df,
                            path=PATH_TO_SAVE_CSV,
                            name=FILE_NAMES_FOR_CSV[2])
    except Exception as err:
        # Best-effort contract kept: report and return None. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, and the message names this function.
        print(f'Something went wrong with [acquire_personal_info]: {err!r}')
        return None
    else:
        # Announce completion only when the table really was cleaned and
        # saved; a ``finally`` clause would also print this after a failure.
        print('\t\t\t  >> Done cleaning df_personal_info!. \n'
              '        \t\t\t\t  >> Chekout /data/processed/\n')
        return df
예제 #7
0
def get_percentages_gender_by_job(base_analysis_df):
    """Count respondents per country/job/gender and their share per country.

    INPUT  -> from all cleaned dfs, joined useful columns by uuid
    OUTPUT -> csv with percentages by country, job and gender

    The frame is grouped by ``country_names``, ``normalized_job_names`` and
    ``gender``. ``quantity`` is the number of rows in each group and
    ``percentage`` is that quantity divided by the total quantity of the
    group's country, times 100, rounded to three decimals. The percentages
    of one country therefore add up to 100.

    Args:
        base_analysis_df: DataFrame containing the three grouping columns
            plus ``uuid``, ``dem_full_time_job``, ``High_Ed``, ``Low_Ed``,
            ``Medium_Ed`` and ``No_Ed`` (all six are dropped before
            grouping; a missing one raises ``KeyError``).

    Returns:
        The aggregated DataFrame, which is also written with
        ``m_ac.save_df_to_csv`` using path ``data/results`` and name
        ``df_percentage_by_job_and_gender``.
    """
    # Variables.
    group_keys = ['country_names', 'normalized_job_names', 'gender']
    unused_cols = ['uuid', 'dem_full_time_job',
                   'High_Ed', 'Low_Ed', 'Medium_Ed', 'No_Ed']

    # quantity == 1 on every row, so its non-null count is the group size
    df_job_gender = (
        base_analysis_df.assign(quantity=1)
        .drop(columns=unused_cols)
        .groupby(group_keys)
        .agg('count')
        .reset_index()
    )

    # Respondents per country, aligned row by row with df_job_gender.
    # The denominator must be the number of people in the country; using the
    # number of distinct job names instead yields values that are not shares
    # of anything and can exceed 100.
    totals_per_country = (
        df_job_gender.groupby(group_keys[0])['quantity'].transform('sum')
    )
    df_job_gender['percentage'] = (
        df_job_gender['quantity'] / totals_per_country * 100
    ).round(3)

    # Save table into local folder
    m_ac.save_df_to_csv(df_job_gender,
                        path='data/results',  # Function adds hierarchy of files
                        name='df_percentage_by_job_and_gender')  # Name of csv

    return df_job_gender
예제 #8
0
def get_df_top_skills(country_argument, num_top_skills, list_dfs_cleaned, gender="All"):
    """Collect the top skills for each education level and save them as CSV.

    The base analysis frame is built once with ``get_base_analysis_df`` and
    ``top_skills_by_ed_level`` is called for ``High_Ed``, ``Medium_Ed`` and
    ``Low_Ed`` in that order. Each call returns a pair: the first items are
    concatenated side by side and transposed into the saved table, the
    second items become the rows of the counts table.

    Args:
        country_argument: Forwarded to ``get_base_analysis_df``.
        num_top_skills: Forwarded to ``top_skills_by_ed_level`` as
            ``number_skills``.
        list_dfs_cleaned: Forwarded to ``get_base_analysis_df`` as
            ``list_of_clean_df``.
        gender: Forwarded to ``top_skills_by_ed_level`` as
            ``gender_to_eval`` and echoed back in the return value.

    Returns:
        A tuple ``(df_to_save, result_counts, gender)``. ``df_to_save`` is
        also written with ``m_ac.save_df_to_csv`` using path
        ``data/results`` and name ``df_top_skills``.
    """
    # Variables
    ed_levels = ['High_Ed', 'Medium_Ed', 'Low_Ed']
    path_to_save = 'data/results'

    # The base frame depends only on the country and the cleaned tables, so
    # build it once instead of once per education level.
    base_df = get_base_analysis_df(country_argument=country_argument,
                                   list_of_clean_df=list_dfs_cleaned)

    # Top skills and counters, one entry per education level
    top_skills = []
    top_counts = []
    for ed_level in ed_levels:
        # Each call gets its own copy, as it did when the frame was rebuilt
        # per level -- guards against the helper mutating its input
        # (assumes get_base_analysis_df returns a DataFrame; TODO confirm).
        skills, counts = top_skills_by_ed_level(
            base_analysis_df=base_df.copy(),
            number_skills=num_top_skills,
            ed_level=ed_level,
            gender_to_eval=gender)
        top_skills.append(skills)
        top_counts.append(counts)

    # Construction of DF of jobs
    result_dfs = pd.concat(top_skills, axis=1, sort=False)

    # Label every position '#n'. At least the first five labels are always
    # mapped; covering the full length keeps positions beyond the fifth from
    # retaining bare integer labels when more skills are requested.
    n_labels = max(5, len(result_dfs.index))
    cols = {n: f'#{n}' for n in range(n_labels)}
    df_to_save = result_dfs.T.rename(columns=cols)

    # Construction of DF of counts
    result_counts = pd.DataFrame(top_counts)

    # Save DataFrame to csv in data/results
    m_ac.save_df_to_csv(df_to_save,
                        path=path_to_save,  # Function adds hierarchy of files
                        name='df_top_skills')  # Name of csv
    return df_to_save, result_counts, gender