def make_csv_allData():
    """Write one CSV holding the name/position/salary rows of every year.

    For each two-digit year key, the per-year DataFrame returned by
    ``df_name_salary_position`` (columns ``Name``, ``Position``, ``Salary``)
    gets a four-digit ``Year`` column.  All frames are then concatenated and
    written to ``uvm_employee_salary_data_1994-2014.csv`` in the current
    working directory, with the columns ordered Year, Name, Position, Salary.

    Raises:
        ValueError: if the salary, employee and position lists of a year do
            not all have the same length (the rows would be misaligned).
    """
    # Master dictionary: key is the two-digit year ("96"), value is the
    # dictionary of per-year lists.
    master_dictionary = allYears_listData_dict()
    k_years = ["96", "97", "98", "99", "00", "01", "02", "03", "04", "05",
               "06", "07", "08", "09", "10", "11", "12", "13", "14"]

    df_list = []
    for year_key in k_years:
        year_data = master_dictionary[year_key]

        # All three lists must line up.  The previous check only tested that
        # position_list was non-empty, not that its length matched.
        n_salary = len(year_data["salary_list"])
        n_employee = len(year_data["employee_list"])
        n_position = len(year_data["position_list"])
        if not (n_salary == n_employee == n_position):
            raise ValueError(
                "list lengths differ for year {0!r}: salary={1}, "
                "employee={2}, position={3}".format(
                    year_key, n_salary, n_employee, n_position))

        # Two-digit keys below 20 are 20xx, everything else is 19xx.
        # NOTE: this pivot will have to change once data from 2020 on exists.
        century = "20" if int(year_key) < 20 else "19"

        df = df_name_salary_position(year_key, True)
        # A scalar is broadcast to every row, so the column always matches
        # the frame's own length.
        df["Year"] = century + year_key
        df_list.append(df)

    df_allData = pd.concat(df_list)

    # NOTE(review): the file name says 1994 although the first year key is
    # "96"; the name is left unchanged because other code may rely on it.
    df_allData.to_csv(path_or_buf="uvm_employee_salary_data_1994-2014.csv",
                      index=False,
                      columns=["Year", "Name", "Position", "Salary"])
def boxplot_salary_allYears():
    """Display a box plot of the salary distribution across all years.

    The salary DataFrame is built by ``df_allYears_salary`` from the master
    dictionary returned by ``allYears_listData_dict``.  The call blocks until
    the plot window is closed.
    """
    # Build the all-years salary frame straight from the master dictionary.
    salaries_by_year = df_allYears_salary(allYears_listData_dict())

    plt.figure()
    salaries_by_year.boxplot(return_type='axes')

    # Title and axis labels.
    plt.title("Distribution of UVM employee salary from 1996 - 2014")
    plt.xlabel("School Year")
    plt.ylabel('Salary (USD)')

    # A blocking show is needed when this runs outside of Canopy.
    plt.show(block=True)
def boxplot_salary_allYears():
    """Plot the distribution of salaries as one box plot over all years.

    Data comes from ``allYears_listData_dict`` and is turned into a DataFrame
    by ``df_allYears_salary``; the figure is shown with a blocking
    ``plt.show``.
    """
    all_data = allYears_listData_dict()
    salary_frame = df_allYears_salary(all_data)

    plt.figure()
    salary_frame.boxplot(return_type='axes')

    # Apply the axis labels and the title in one pass.
    annotations = (
        (plt.ylabel, 'Salary (USD)'),
        (plt.xlabel, "School Year"),
        (plt.title, "Distribution of UVM employee salary from 1996 - 2014"),
    )
    for apply_text, text in annotations:
        apply_text(text)

    # block=True keeps the window open when run outside of Canopy.
    plt.show(block=True)
def make_csv_allData():
    """Write one CSV holding the name/position/salary rows of every year.

    For each two-digit year key, the per-year DataFrame returned by
    ``df_name_salary_position`` (columns ``Name``, ``Position``, ``Salary``)
    gets a four-digit ``Year`` column.  All frames are then concatenated and
    written to ``uvm_employee_salary_data_1994-2014.csv`` in the current
    working directory, with the columns ordered Year, Name, Position, Salary.

    Raises:
        ValueError: if the salary, employee and position lists of a year do
            not all have the same length (the rows would be misaligned).
    """
    # Master dictionary: key is the two-digit year ("96"), value is the
    # dictionary of per-year lists.
    master_dictionary = allYears_listData_dict()
    k_years = [
        "96", "97", "98", "99", "00", "01", "02", "03", "04", "05",
        "06", "07", "08", "09", "10", "11", "12", "13", "14",
    ]

    df_list = []
    for year_key in k_years:
        year_data = master_dictionary[year_key]

        # All three lists must line up.  The previous check only tested that
        # position_list was non-empty, not that its length matched.
        n_salary = len(year_data["salary_list"])
        n_employee = len(year_data["employee_list"])
        n_position = len(year_data["position_list"])
        if not (n_salary == n_employee == n_position):
            raise ValueError(
                "list lengths differ for year {0!r}: salary={1}, "
                "employee={2}, position={3}".format(
                    year_key, n_salary, n_employee, n_position))

        # Two-digit keys below 20 are 20xx, everything else is 19xx.
        # NOTE: this pivot will have to change once data from 2020 on exists.
        century = "20" if int(year_key) < 20 else "19"

        df = df_name_salary_position(year_key, True)
        # A scalar is broadcast to every row, so the column always matches
        # the frame's own length.
        df["Year"] = century + year_key
        df_list.append(df)

    df_allData = pd.concat(df_list)

    # NOTE(review): the file name says 1994 although the first year key is
    # "96"; the name is left unchanged because other code may rely on it.
    df_allData.to_csv(
        path_or_buf="uvm_employee_salary_data_1994-2014.csv",
        index=False,
        columns=["Year", "Name", "Position", "Salary"])
##### IMPORT STATEMENTS: ##### from allYears_listData_dict import allYears_listData_dict from df_allYears_salary import df_allYears_salary import pandas as pd import pprint pd.set_option('max_colwidth', 30) # This is so all the columns can show on my mac # TEMPORARY ################################## # make a local reference of the master dictionary of all the data master = allYears_listData_dict() def get_outliers(year_str, master_dict): # # make a local reference of the master dictionary of all the data # master = allYears_listData_dict() # #make the complete dataframe of all years df_allSalary = df_allYears_salary(master_dict) # Find out what the upper bound for top outliers is: std = df_allSalary.loc[:, year_str].std() mean = df_allSalary.loc[:, year_str].mean() upper_bound = mean + (2 * std) # making a copy # df_copy = df_allSalary.loc[:,year_str].copy() # df_copy = [df_copy > upper_bound] # print df_copy # Initialize variables for outliers index_outliers = [] # collects the index of outlier salary_outliers = [] #gets the float salary of outliers
def df_avgSalary_position(year_str, sort_boolean):
    """Build a DataFrame with the average salary of every position in a year.

    Args:
        year_str: key of the year in the dictionary returned by
            ``allYears_listData_dict`` (for example "14").
        sort_boolean: when true, the rows are sorted by ``Average_Salary``
            in descending order.

    Returns:
        A DataFrame with the columns ``Employee_Position``,
        ``Average_Salary`` and ``Number_of_Employees``.  ``Average_Salary``
        is a string formatted with ``'{:20,.2f}'``.  ``Number_of_Employees``
        counts only employees whose salary could be added up; non-numeric
        entries (unpaid leave) are excluded.  Without sorting, rows appear
        in order of first appearance of each position.
    """
    year_data = allYears_listData_dict()[year_str]
    salary_float = year_data["salary_float"]
    position_list = year_data["position_list"]

    # Group employee indexes by position in a single pass instead of
    # rescanning the whole list once per unique position.
    positions = []           # unique positions, in order of first appearance
    members_by_position = {}
    for employee_index, position_name in enumerate(position_list):
        if position_name not in members_by_position:
            members_by_position[position_name] = []
            positions.append(position_name)
        members_by_position[position_name].append(employee_index)

    avg_salary = []
    paid_employees_count = []
    for position_name in positions:
        member_indexes = members_by_position[position_name]
        total_salary = 0
        unpaid_leave = 0
        for employee_index in member_indexes:
            try:
                total_salary += salary_float[employee_index]
            except (TypeError, IndexError):
                # Non-numeric salary (unpaid leave) or no salary entry:
                # the employee does not count towards the average.
                unpaid_leave += 1

        # Paid employees are all employees minus those without a salary.
        paid_employees = len(member_indexes) - unpaid_leave
        if paid_employees <= 0:
            average_salary = 0
        else:
            # float() guards against integer division under Python 2.
            average_salary = float(total_salary) / paid_employees

        # Right-aligned, fixed-width text: for non-negative values that fit
        # in 20 characters the strings sort in the same order as the numbers.
        avg_salary.append('{:20,.2f}'.format(average_salary))
        paid_employees_count.append(paid_employees)

    # Materialize the zip so the constructor also gets a list on Python 3.
    rows = list(zip(positions, avg_salary, paid_employees_count))
    df = pd.DataFrame(
        rows,
        columns=["Employee_Position", "Average_Salary", "Number_of_Employees"])

    if sort_boolean:
        # DataFrame.sort was removed from pandas; prefer sort_values and
        # fall back to sort only on very old versions.
        if hasattr(df, "sort_values"):
            df = df.sort_values(by="Average_Salary", ascending=False)
        else:
            df = df.sort(["Average_Salary"], ascending=False)
    return df
##### IMPORT STATEMENTS: ##### from allYears_listData_dict import allYears_listData_dict from df_allYears_salary import df_allYears_salary import pandas as pd import pprint pd.set_option('max_colwidth', 30) # This is so all the columns can show on my mac # TEMPORARY ################################## # make a local reference of the master dictionary of all the data master = allYears_listData_dict() def get_outliers(year_str, master_dict): # # make a local reference of the master dictionary of all the data # master = allYears_listData_dict() # #make the complete dataframe of all years df_allSalary= df_allYears_salary(master_dict) # Find out what the upper bound for top outliers is: std = df_allSalary.loc[:, year_str].std() mean = df_allSalary.loc[:, year_str].mean() upper_bound = mean + (2*std) # making a copy # df_copy = df_allSalary.loc[:,year_str].copy() # df_copy = [df_copy > upper_bound] # print df_copy # Initialize variables for outliers index_outliers = [] # collects the index of outlier salary_outliers = [] #gets the float salary of outliers
def df_name_salary_position(year_str, sort_boolean):
    """Build a DataFrame of employee name, position and salary for one year.

    Args:
        year_str: key of the year in the dictionary returned by
            ``allYears_listData_dict`` (for example "14").
        sort_boolean: when true, the rows are sorted by ``Salary`` in
            descending order.

    Returns:
        A DataFrame with the columns ``Name``, ``Position`` and ``Salary``.
        Salary entries that are the empty string (unpaid leave) are reported
        as 0.  Every employee is included.

    Side effects:
        Prints a one-line confirmation when the incoming lists have equal
        lengths, otherwise prints the three lengths.
    """
    year_data = allYears_listData_dict()[year_str]
    employee_list = year_data["employee_list"]
    position_list = year_data["position_list"]

    # Replace the unpaid-leave marker ("") by 0 in a new list, so the list
    # stored in the source dictionary is not modified.
    salary_float = [0 if salary == "" else salary
                    for salary in year_data["salary_float"]]

    # Length diagnostics.  Single-argument print() behaves the same on
    # Python 2 and Python 3.
    if len(salary_float) == len(position_list) == len(employee_list):
        print("all incoming lists are the same length")
    else:
        print(len(salary_float))
        print(len(position_list))
        print(len(employee_list))

    # The earlier version also built filtered copies of the lists without the
    # unpaid-leave employees, but never used them; that quadratic loop and
    # its bare except are gone.

    # Materialize the zip so the constructor also gets a list on Python 3.
    rows = list(zip(employee_list, position_list, salary_float))
    df = pd.DataFrame(rows, columns=["Name", "Position", "Salary"])

    if sort_boolean:
        # DataFrame.sort was removed from pandas; prefer sort_values and
        # fall back to sort only on very old versions.
        if hasattr(df, "sort_values"):
            df = df.sort_values(by="Salary", ascending=False)
        else:
            df = df.sort(["Salary"], ascending=False)
    return df