def make_csv_allData():
    """Write every year's name/position/salary rows to a single CSV file.

    For each two-digit year key from "96" through "14", the DataFrame built by
    df_name_salary_position() is tagged with a four-digit "Year" column. The
    per-year frames are concatenated and written, without the index, to
    "uvm_employee_salary_data_1994-2014.csv" using the column order
    Year, Name, Position, Salary.

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    # Master dictionary: the key is the two-digit year ("96") and the value is
    # that year's dictionary of lists.
    master_dictionary = allYears_listData_dict()
    k_years = [
        "96", "97", "98", "99", "00", "01", "02", "03", "04", "05", "06", "07",
        "08", "09", "10", "11", "12", "13", "14",
    ]
    checked_lists = ("salary_list", "employee_list", "position_list")

    df_list = []
    for short_year in k_years:
        year_data = master_dictionary[short_year]
        df = df_name_salary_position(short_year, True)

        # The three source lists are expected to line up row for row. Compare
        # all three lengths (the earlier check only tested that position_list
        # was non-empty) and report a mismatch instead of failing later.
        lengths = [len(year_data[name]) for name in checked_lists]
        if len(set(lengths)) != 1:
            print("warning: year %s has mismatched list lengths %s"
                  % (short_year, lengths))

        # Keys below "20" belong to the 2000s, the rest to the 1900s.
        # This will have to change after 2020!
        century = "20" if int(short_year) < 20 else "19"

        # A scalar is broadcast to every row, so the new column always has
        # exactly as many entries as the frame it is added to.
        df["Year"] = century + short_year
        df_list.append(df)

    df_allData = pd.concat(df_list)

    # NOTE(review): the file name says 1994 although the first year key is
    # "96"; the name is kept unchanged so existing consumers still find it.
    df_allData.to_csv(path_or_buf="uvm_employee_salary_data_1994-2014.csv",
                      index=False,
                      columns=["Year", "Name", "Position", "Salary"])
def boxplot_salary_allYears():
    """Display a box plot of the salary data returned by df_allYears_salary().

    The master dictionary is built with allYears_listData_dict(), converted to
    a salary DataFrame, drawn on a new figure and shown in a blocking window.
    """
    # Salary DataFrame covering all years, built from the master dictionary.
    salaries_by_year = df_allYears_salary(allYears_listData_dict())

    plt.figure()
    salaries_by_year.boxplot(return_type='axes')

    # Title and axis labels.
    plt.title("Distribution of UVM employee salary from 1996 - 2014")
    plt.xlabel("School Year")
    plt.ylabel('Salary (USD)')

    # A blocking show() is needed when this runs outside of Canopy.
    plt.show(block=True)
def boxplot_salary_allYears():
    """Draw and show the all-years salary box plot.

    Builds the master dictionary, turns it into the salary DataFrame via
    df_allYears_salary(), plots it on a fresh figure and blocks until the
    window is closed.
    """
    all_years_data = allYears_listData_dict()
    salary_frame = df_allYears_salary(all_years_data)

    plt.figure()
    salary_frame.boxplot(return_type='axes')

    # Label the axes and add the title.
    plt.xlabel("School Year")
    plt.ylabel('Salary (USD)')
    plt.title("Distribution of UVM employee salary from 1996 - 2014")

    # block=True keeps the window open when run outside of Canopy.
    plt.show(block=True)
# Example no. 4
0
def make_csv_allData():
    """Combine all yearly name/position/salary tables into one CSV file.

    Each two-digit year key ("96" .. "14") is looked up in the master
    dictionary, its DataFrame is built with df_name_salary_position(), and a
    four-digit "Year" column is added. The yearly frames are concatenated and
    written, without the index, to "uvm_employee_salary_data_1994-2014.csv"
    with the columns Year, Name, Position, Salary.

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    # Master dictionary: key is the two-digit year ("96"), value is the
    # dictionary of lists for that year.
    master_dictionary = allYears_listData_dict()
    k_years = [
        "96", "97", "98", "99", "00", "01", "02", "03", "04", "05", "06", "07",
        "08", "09", "10", "11", "12", "13", "14",
    ]
    checked_lists = ("salary_list", "employee_list", "position_list")

    df_list = []
    for short_year in k_years:
        year_data = master_dictionary[short_year]
        df = df_name_salary_position(short_year, True)

        # Verify that all three lists really have the same length. The
        # previous condition only required position_list to be non-empty, and
        # a failed check left an empty Year list that could not be assigned.
        lengths = [len(year_data[name]) for name in checked_lists]
        if len(set(lengths)) != 1:
            print("warning: year %s has mismatched list lengths %s"
                  % (short_year, lengths))

        # Two-digit keys below 20 are 20xx, everything else is 19xx.
        # This will have to change after 2020!
        if int(short_year) < 20:
            year_str = "20" + short_year
        else:
            year_str = "19" + short_year

        # Assigning a scalar fills every row, so the column length always
        # matches the frame regardless of how the source lists compare.
        df["Year"] = year_str
        df_list.append(df)

    df_allData = pd.concat(df_list)

    # NOTE(review): the file name says 1994 although the first year key is
    # "96"; the name is kept unchanged so existing consumers still find it.
    df_allData.to_csv(path_or_buf="uvm_employee_salary_data_1994-2014.csv",
                      index=False,
                      columns=["Year", "Name", "Position", "Salary"])
# Example no. 5
0
##### IMPORT STATEMENTS: #####
from allYears_listData_dict import allYears_listData_dict
from df_allYears_salary import df_allYears_salary
import pandas as pd
import pprint
# Limit the displayed width of each column to 30 characters so that all the
# columns can be shown when a DataFrame is printed.
pd.set_option('max_colwidth',
              30)

# TEMPORARY ##################################
# Module-level reference to the master dictionary of all the data. It is
# built once, when this module is imported.
master = allYears_listData_dict()


def get_outliers(year_str, master_dict):
    """Compute the upper outlier bound for one year's salary column.

    Args:
        year_str: Column label selected from the all-years salary DataFrame.
        master_dict: Master data dictionary handed to df_allYears_salary().

    Returns:
        None.
        NOTE(review): the visible body stops after initialising the outlier
        accumulators; nothing is collected or returned - confirm whether the
        remainder of the function is missing.
    """
    # Complete DataFrame of all years, then the single column of interest.
    df_allSalary = df_allYears_salary(master_dict)
    year_salaries = df_allSalary.loc[:, year_str]

    # Upper bound for top outliers: two standard deviations above the mean.
    upper_bound = year_salaries.mean() + (2 * year_salaries.std())

    # Accumulators for the outliers.
    index_outliers = []  # index of each outlier
    salary_outliers = []  # float salary of each outlier
def df_avgSalary_position(year_str, sort_boolean):
    """Build a table of the average salary per position for one year.

    Args:
        year_str: Key into the dictionary returned by allYears_listData_dict()
            (for example "14").
        sort_boolean: When truthy, rows are ordered by average salary, highest
            first. Otherwise positions appear in order of first occurrence.

    Returns:
        A pandas DataFrame with the columns "Employee_Position",
        "Average_Salary" (a string formatted with '{:20,.2f}') and
        "Number_of_Employees" (employees whose salary could be summed).
    """
    year_data = allYears_listData_dict()[year_str]
    salary_float = year_data["salary_float"]
    position_list = year_data["position_list"]

    # Single pass: record which row indexes belong to each distinct position
    # instead of rescanning the whole list once per position.
    rows_by_position = {}
    positions = []  # distinct positions, in order of first occurrence
    for row, position in enumerate(position_list):
        if position not in rows_by_position:
            rows_by_position[position] = []
            positions.append(position)
        rows_by_position[position].append(row)

    averages = []
    paid_employees_count = []
    for position in positions:
        total_salary = 0
        paid = 0
        for row in rows_by_position[position]:
            try:
                total_salary += salary_float[row]
            except (TypeError, IndexError):
                # The salary cannot be added (presumably an unpaid-leave
                # placeholder such as "" - verify against the data source) or
                # is missing; leave this employee out of the average.
                continue
            paid += 1
        # float() keeps the division exact under Python 2 as well.
        averages.append(float(total_salary) / paid if paid else 0.0)
        paid_employees_count.append(paid)

    column_order = ["Employee_Position", "Average_Salary",
                    "Number_of_Employees"]
    df = pd.DataFrame(
        {
            "Employee_Position": positions,
            "Average_Salary": averages,
            "Number_of_Employees": paid_employees_count,
        },
        columns=column_order)

    # Sort while the averages are still numbers; the formatted strings are
    # for display only.
    if sort_boolean:
        if hasattr(df, "sort_values"):
            df = df.sort_values(["Average_Salary"], ascending=False)
        else:
            # Older pandas releases only provide DataFrame.sort.
            df = df.sort(["Average_Salary"], ascending=False)

    df["Average_Salary"] = df["Average_Salary"].map('{:20,.2f}'.format)
    return df
#    if (sort_boolean):
# 	df.sort(["Average Salary"], ascending=False)
# return df


# ###### CALL FUNCTION / TESTING #####################
# b = df_avgSalary_position("14", True)
# print b
# Example no. 7
0
##### IMPORT STATEMENTS: #####
from allYears_listData_dict import allYears_listData_dict
from df_allYears_salary import df_allYears_salary
import pandas as pd
import pprint
# Limit the displayed width of each column to 30 characters so that all the
# columns can be shown when a DataFrame is printed.
pd.set_option('max_colwidth', 30)


# TEMPORARY ##################################
# Module-level reference to the master dictionary of all the data. It is
# built once, when this module is imported.
master = allYears_listData_dict()


def get_outliers(year_str, master_dict):
    """Work out the upper bound used to flag top salary outliers for a year.

    Args:
        year_str: Column label picked out of the all-years salary DataFrame.
        master_dict: Master data dictionary passed on to df_allYears_salary().

    Returns:
        None.
        NOTE(review): the body visible here ends right after the outlier
        lists are created, so nothing is filled in or returned - confirm
        whether the rest of the function was cut off.
    """
    # Build the complete DataFrame of all years and pick the requested year.
    all_salaries = df_allYears_salary(master_dict)
    salary_column = all_salaries.loc[:, year_str]

    # Top outliers lie more than two standard deviations above the mean.
    spread = salary_column.std()
    centre = salary_column.mean()
    upper_bound = centre + (2 * spread)

    # Containers for the outliers.
    index_outliers = []  # collects the index of each outlier
    salary_outliers = []  # collects the float salary of each outlier
def df_name_salary_position(year_str, sort_boolean):
    """Build a name/position/salary table for one year.

    Args:
        year_str: Key into the dictionary returned by
            allYears_listData_dict().
        sort_boolean: When truthy, rows are ordered by salary, highest first.

    Returns:
        A pandas DataFrame with the columns "Name", "Position" and "Salary".
        Salaries stored as the empty string "" are reported as 0.
    """
    year_data = allYears_listData_dict()[year_str]
    salary_float = year_data["salary_float"]
    position_list = year_data["position_list"]
    employee_list = year_data["employee_list"]

    # Debugging aid: report whether the incoming lists line up, and print
    # their lengths when they do not.
    if len(salary_float) == len(position_list) == len(employee_list):
        print("all incoming lists are the same length")
    else:
        print(len(salary_float))
        print(len(position_list))
        print(len(employee_list))

    # Replace blank salaries with 0 in a new list, so the list held by the
    # data dictionary is left untouched for any other code that reads it.
    salaries = [salary if salary != "" else 0 for salary in salary_float]

    # list() materialises the rows so this also works where zip() is lazy.
    rows = list(zip(employee_list, position_list, salaries))
    df = pd.DataFrame(rows, columns=["Name", "Position", "Salary"])

    if sort_boolean:
        if hasattr(df, "sort_values"):
            df = df.sort_values(["Salary"], ascending=False)
        else:
            # Older pandas releases only provide DataFrame.sort.
            df = df.sort(["Salary"], ascending=False)

    return df