import os
import pickle
from collections import defaultdict, OrderedDict as od
from sys import argv

import numpy as np
import pandas as pd

# project-local modules (import names assumed from the usage below:
# f.<helper functions>, cv.convert, and the mcl color list helper)
import functions as f
import converter as cv
from matplotlib_charting import make_color_list as mcl


def main():

    script, case = argv

    os.makedirs('dill/', exist_ok=True)

    try:
        # check to see if the file exists and get its value if it does
        case_dill_value = pd.read_pickle('dill/case_dill.pkl').case.value
    except OSError:
        case_dill_value = 'empty_placeholder'

    if case_dill_value == case:
        # If the stored value is the same as the case study name, remove
        # only the files which will be replaced.  Removing the old files
        # and then writing the new files to disk is faster than
        # overwriting the old files.
        if os.path.isdir('dill/'):

            clear_files = ['squeeze_vals.pkl', 'last_month.pkl',
                           'dict_color.pkl', 'dict_settings.pkl',
                           'dict_attr.pkl', 'master.pkl',
                           'pay_table_enhanced.pkl', 'pay_table_basic.pkl']

            filelist = \
                [pkl for pkl in os.listdir('dill/') if pkl in clear_files]
            for pkl in filelist:
                os.remove('dill/' + pkl)
    else:
        # if the case name is different, delete all dill files (stored
        # calculated files)
        f.clear_dill_files()

    # create a new case_dill.pkl file
    case_dill = pd.DataFrame({'case': case}, index=['value'])
    case_dill.to_pickle('dill/case_dill.pkl')

    # START THE SETTINGS DICTIONARY - POPULATE WITH THE SCALARS ONLY
    # Some of these values are used for the pay data calculation, and some
    # of the calculated pay data is then used to further populate the
    # settings dictionary.
    xl = pd.read_excel('excel/' + case + '/settings.xlsx',
                       sheet_name=None)
    settings = defaultdict(int)

    # ## scalars
    settings.update(f.make_dict_from_columns(xl['scalars'],
                                             'option', 'value'))

    # PAY TABLES $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

    xl_pay_path = 'excel/' + case + '/pay_tables.xlsx'
    # read the pay table rate data from the Excel file
    pay_rates = pd.read_excel(xl_pay_path, sheet_name='rates')
    # read the monthly pay hours per job level and the job descriptions
    pay_hours = pd.read_excel(xl_pay_path, sheet_name='hours')

    # Inputs to determine the global sorting master year and longevity
    # (function parameters).  A second set of parameters is included for
    # the enhanced model and set to None; a check for "not None" alters
    # the sort for the enhanced case...
    year = settings['pay_table_year_sort']
    longevity = settings['pay_table_longevity_sort']

    # instantiate dict values to None
    basic_compen = None
    full_mth_compen = None
    part_mth_compen = None
    job_key_enhan = None
    job_key_basic = None
    basic = None
    enhanced = None
    job_dict_df = None

    # numpy unique returns a SORTED array of unique elements
    contract_years = np.unique(pay_rates.year)
    settings['contract_end'] = max(contract_years)
    settings['contract_years'] = contract_years

    # extract integer column names (these represent years of pay longevity)
    longevity_cols = []
    for col in pay_rates.columns.values.tolist():
        try:
            int(col)
            longevity_cols.append(col)
        except ValueError:
            pass

    table_cols = ['year', 'jnum']
    table_cols.extend(longevity_cols)

    basic = pd.merge(pay_rates, pay_hours)

    # for enhanced_jobs:
    enhanced_full = basic.copy()
    enhanced_part = basic.copy()

    # SELECTED COLUMNS MULTIPLIED BY A DESIGNATED COLUMN ROW VALUE
    basic[longevity_cols] = basic[longevity_cols]\
        .multiply(basic['basic_hours'], axis="index")

    # sort by year and job level and only keep columns: 'year', 'jnum',
    # and all year longevity (integer) columns
    basic_compen = basic.sort_values(['year', 'jnum'])[table_cols]\
        .set_index('year', drop=True)

    # create small dataframes for furloughed pay data (no pay)
    fur_rows = pd.DataFrame(0., index=np.arange(len(contract_years)),
                            columns=basic.columns)

    basic_fur_rows = fur_rows.copy()
    basic_fur_rows.jnum = basic.jnum.max() + 1
    basic_fur_rows.year = contract_years
    basic_fur_rows.jobstr = 'FUR'

    # CONCATENATE the furlough pay data to the basic pay data
    basic = pd.concat([basic, basic_fur_rows])

    # Select a SECTION OF THE PAY DATA TO USE AS A MASTER ORDER for the
    # entire pay dataframe(s).  In other words, the job level order of the
    # entire pay dataframe will match the selected year and pay longevity
    # order, even if certain year and pay level compensation amounts are
    # not in descending order.  The order must be consistent for the data
    # model.
    order_basic = basic[basic.year == year][['jnum', longevity, 'jobstr']]\
        .sort_values(longevity, ascending=False)

    order_basic['order'] = np.arange(len(order_basic)) + 1

    job_key_basic = order_basic[['order', 'jobstr', 'jnum']].copy()

    # make a dataframe to save the job level hierarchy
    job_key_basic.set_index('order', drop=True, inplace=True)
    job_key_basic.rename(columns={'jnum': 'orig_order'}, inplace=True)

    # This is the way each job level hierarchy is sorted for each year.
    # This dataframe is merged with the 'enhanced' dataframe, and enhanced
    # is then sorted by the year and order columns.
    order_basic = order_basic.reset_index()[['jnum', 'order']]

    basic = pd.merge(basic, order_basic).sort_values(['year', 'order'])\
        .reset_index(drop=True)

    basic.jnum = basic.order

    basic_df = basic[table_cols].copy()

    # MELT AND INDEX - CREATING INDEXED MONTHLY PAY DATAFRAME(S)
    melt_basic = pd.melt(basic_df, id_vars=['year', 'jnum'],
                         var_name='scale',
                         value_name='monthly')

    melt_basic['ptindex'] = (melt_basic.year * 100000 +
                             melt_basic.scale * 100 +
                             melt_basic.jnum)

    melt_basic.drop(['scale', 'year', 'jnum'], axis=1, inplace=True)
    melt_basic.sort_values('ptindex', inplace=True)
    melt_basic.set_index('ptindex', drop=True, inplace=True)

    melt_basic.to_pickle('dill/pay_table_basic.pkl')

    # Calculate for enhanced_jobs and write to the workbook
    # ENHANCED JOBS

    # calculate monthly compensation for each job level and pay longevity
    enhanced_full[longevity_cols] = enhanced_full[longevity_cols]\
        .multiply(enhanced_full['full_hours'], axis="index")

    enhanced_part[longevity_cols] = enhanced_part[longevity_cols]\
        .multiply(enhanced_part['part_hours'], axis="index")

    # ENHANCED TABLE SUFFIXES, COLUMNS, JNUMS (ENHANCED_PART)

    # Make enhanced_part (fewer hours per position per month).
    # jnums begin with the maximum enhanced_full jnum + 1 and
    # increment upward.
    enhanced_part.jnum = enhanced_part.jnum + enhanced_part.jnum.max()

    # sort by year and job level and only keep columns: 'year', 'jnum',
    # and all year longevity (integer) columns
    full_mth_compen = enhanced_full.sort_values(['year',
                                                 'jnum'])[table_cols]\
        .set_index('year', drop=True)
    part_mth_compen = enhanced_part.sort_values(['year',
                                                 'jnum'])[table_cols]\
        .set_index('year', drop=True)

    # add the appropriate suffixes to the jobstr columns of the full and
    # part enhanced tables
    full_suf = settings['enhanced_jobs_full_suffix']
    part_suf = settings['enhanced_jobs_part_suffix']
    enhanced_full.jobstr = enhanced_full.jobstr.astype(str) + full_suf
    enhanced_part.jobstr = enhanced_part.jobstr.astype(str) + part_suf

    # CONCATENATE the full and part(-time) enhanced jobs dataframes
    enhanced = pd.concat([enhanced_full, enhanced_part])

    enhan_fur_rows = fur_rows.copy()
    enhan_fur_rows.jnum = enhanced.jnum.max() + 1
    enhan_fur_rows.year = contract_years
    enhan_fur_rows.jobstr = 'FUR'

    # CONCATENATE the furlough pay data to the enhanced pay data
    enhanced = pd.concat([enhanced, enhan_fur_rows])

    # select a SECTION OF THE PAY DATA TO USE AS A MASTER ORDER
    # for the entire pay dataframe(s)
    order_enhan = \
        enhanced[enhanced.year == year][['jnum', longevity, 'jobstr']]\
        .sort_values(longevity, ascending=False)

    order_enhan['order'] = np.arange(len(order_enhan)) + 1
    job_key_enhan = order_enhan[['order', 'jobstr', 'jnum']].copy()

    # Make a dataframe to assist with the job dictionary construction
    # (case-specific config file variable 'jd').
    s = job_key_enhan['jnum'].reset_index(drop=True)
    jobs = np.arange((s.max() - 1) / 2) + 1
    j_cnt = jobs.max()
    idx_list1 = []
    idx_list2 = []
    for job_level in jobs:
        idx_list1.append(s[s == job_level].index[0] + 1)
        idx_list2.append(s[s == job_level + j_cnt].index[0] + 1)

    dict_data = (('job', jobs.astype(int)),
                 ('full', idx_list1),
                 ('part', idx_list2),
                 ('jobstr', list(job_key_basic.jobstr[:int(j_cnt)])),
                 ('full_pcnt', list(pay_hours.full_pcnt)))
    # the use of an ordered dict preserves the column order
    job_dict_df = pd.DataFrame(data=od(dict_data)).set_index('job',
                                                             drop=True)

    # make a dataframe to save the job level hierarchy
    job_key_enhan.set_index('order', drop=True, inplace=True)
    job_key_enhan.rename(columns={'jnum': 'concat_order'}, inplace=True)
    order_enhan = order_enhan.reset_index()[['jnum', 'order']]

    enhanced = pd.merge(enhanced,
                        order_enhan).sort_values(['year', 'order'])\
        .reset_index(drop=True)

    enhanced.jnum = enhanced.order
    enhanced_df = enhanced[table_cols].copy()

    # MELT AND INDEX - CREATING INDEXED MONTHLY PAY DATAFRAME(S)
    melt_enhan = pd.melt(enhanced_df, id_vars=['year', 'jnum'],
                         var_name='scale',
                         value_name='monthly')

    melt_enhan['ptindex'] = (melt_enhan.year * 100000 +
                             melt_enhan.scale * 100 +
                             melt_enhan.jnum)

    melt_enhan.drop(['scale', 'year', 'jnum'], axis=1, inplace=True)
    melt_enhan.sort_values('ptindex', inplace=True)
    melt_enhan.set_index('ptindex', drop=True, inplace=True)

    melt_enhan.to_pickle('dill/pay_table_enhanced.pkl')

    # WRITE THE PAY DATA TO AN EXCEL FILE - WITHIN THE CASE-NAMED FOLDER
    # WITHIN THE 'REPORTS' FOLDER
    path = 'reports/' + case + '/'
    os.makedirs(path, exist_ok=True)

    writer = pd.ExcelWriter(path + 'pay_table_data.xlsx')
    # string-to-dataframe items for ws_dict
    dict_items = (('basic (no sort)', basic_compen),
                  ('enhanced full (no sort)', full_mth_compen),
                  ('enhanced part (no sort)', part_mth_compen),
                  ('basic ordered', basic),
                  ('enhanced ordered', enhanced),
                  ('basic job order', job_key_basic),
                  ('enhanced job order', job_key_enhan),
                  ('job dict', job_dict_df))

    ws_dict = od(dict_items)

    # write the pay data dataframes to the workbook
    for key, value in ws_dict.items():
        try:
            value.to_excel(writer, key)
        except Exception:
            pass

    writer.save()

    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
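    # A worked illustration of the ptindex encoding used above (the values
    # are illustrative only, not from any actual case study): a row for
    # contract year 2015, longevity step 7, job level 3 is indexed as
    # 2015 * 100000 + 7 * 100 + 3 = 201500703.  Year, scale, and jnum
    # occupy non-overlapping digit ranges, so every (year, scale, jnum)
    # combination yields a unique integer.  The dataset-building scripts
    # below rebuild the same index from their long-form columns and look
    # up monthly pay by simple index alignment (see the df_pt_index usage).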
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    # dict items from the calculated pay data - refactor to eliminate
    # reading the file just written; just use the variables from above...
    xl_pay = pd.read_excel('reports/' + case + '/pay_table_data.xlsx',
                           sheet_name=['basic job order',
                                       'enhanced job order',
                                       'job dict'])
    df_jd = xl_pay['job dict']
    df_jd['list_cols'] = f.make_lists_from_columns(xl_pay['job dict'],
                                                   ['full', 'part',
                                                    'full_pcnt'])

    settings['jd'] = f.make_dict_from_columns(df_jd, 'job', 'list_cols')

    if settings['enhanced_jobs']:
        descr_df = xl_pay['enhanced job order']
    else:
        descr_df = xl_pay['basic job order']

    job_strings = list(descr_df.jobstr)
    settings['job_strs'] = job_strings
    settings['job_strs_dict'] = od(enumerate(job_strings, 1))
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

    # ADD MORE ITEMS TO THE SETTINGS DICTIONARY //////////////////////////////

    pay_ex = xl['pay_exceptions']
    settings['pay_exceptions'] = dict([(i, [a, b]) for i, a, b in
                                       zip(pay_ex.year_code,
                                           pay_ex.start_date,
                                           pay_ex.end_date)])
    settings['ret_incr'] = \
        f.make_tuples_from_columns(xl['ret_incr'],
                                   ['month_start', 'month_increase'],
                                   return_as_list=False,
                                   return_dates_as_strings=True,
                                   date_cols=['month_start'])

    # ## init_ret_age
    settings['init_ret_age'] = settings['init_ret_age_years'] + \
        (settings['init_ret_age_months'] * 1 / 12)

    # ## ret_incr_dict
    settings['ret_incr_dict'] = od(settings['ret_incr'])

    # ## ret_age
    init_ret_age = settings['init_ret_age']
    if settings['ret_age_increase']:
        ret_dict = settings['ret_incr_dict']
        ret_age = init_ret_age + sum(ret_dict.values()) * (1 / 12)
    else:
        ret_age = init_ret_age

    settings['ret_age'] = ret_age

    start_date = pd.to_datetime(settings['starting_date'])

    # ## imp_month
    imp_date = settings['implementation_date']
    settings['imp_month'] = ((imp_date.year - start_date.year) * 12) - \
        (start_date.month - imp_date.month)

    # ## num_of_job_levels
    if settings['enhanced_jobs']:
        settings['num_of_job_levels'] = settings['job_levels_enhanced']
    else:
        settings['num_of_job_levels'] = settings['job_levels_basic']

    # ## eg_counts
    df = xl['job_counts']
    filter_cols = [col for col in df.columns.values.tolist()
                   if str(col).startswith('eg')]
    # if the user fails to use the "eg" prefix with the eg numbers and
    # uses integer headers instead:
    if not filter_cols:
        try:
            filter_cols = [col for col in df.columns.values.tolist()
                           if type(int(col)) == int]
        except ValueError:
            print('error: eg_counts.  Check that the job_count worksheet '
                  'headers start with "eg".')

    df_filt = df[filter_cols]
    # sort the columns to ensure the proper reference order for standalone
    # counts (in case the user input columns are not sorted)
    df_filt.sort_index(axis=1, inplace=True)

    eg_counts = []
    for col in df_filt:
        eg_counts.append(list(df_filt[col]))
    settings['eg_counts'] = eg_counts

    # ## j_changes
    df = xl['job_changes']

    start = list(df.month_start)
    end = list(df.month_end)
    jc_set = set()
    for i in np.arange(len(start)):
        jc_set = jc_set.union(set(range(start[i], end[i] + 1)))
    settings['jc_months'] = jc_set

    df['lister1'] = f.make_lists_from_columns(df, ['month_start',
                                                   'month_end'])
    filter_cols = \
        [col for col in df.columns.values.tolist() if col.startswith('eg')]
    df['lister2'] = f.make_lists_from_columns(df, filter_cols)
    settings['j_changes'] = f.make_lists_from_columns(
        df, ['job', 'lister1', 'total_change', 'lister2'])

    # ## recalls
    df = xl['recall']
    filter_cols = \
        [col for col in df.columns.values.tolist() if col.startswith('eg')]
    df['lister'] = f.make_lists_from_columns(df, filter_cols)
    settings['recalls'] = f.make_lists_from_columns(
        df, ['total_monthly', 'lister', 'month_start', 'month_end'])

    # ## sg_rights
    df = xl['prex']

    # make the pre-existing rights condition month range
    month_start = df.month_start.min()
    month_end = df.month_end.max()
    settings['prex_month_range'] = set(range(month_start,
                                             month_end + 1))

    sg_col_list = ['eg', 'job', 'count', 'month_start', 'month_end']
    filter_cols = \
        [col for col in df.columns.values.tolist() if col in sg_col_list]
    settings['sg_rights'] = f.make_lists_from_columns(df, filter_cols)

    # ## ratio_cond
    df = xl['ratio_cond']

    # make the ratio condition month range
    month_start = df.month_start.min()
    month_end = df.month_end.max()
    settings['ratio_month_range'] = set(range(month_start,
                                              month_end + 1))

    # make snap_ratio_on_off_dict
    settings['snap_ratio_on_off_dict'] = \
        f.make_dict_from_columns(df, 'basic_job', 'snapshot')

    df_cols = df.columns.values.tolist()
    group_cols = [col for col in df_cols if col.startswith('group')]
    weight_cols = [col for col in df_cols if col.startswith('weight')]
    for col in group_cols:
        df[col] = f.make_group_lists(df, col)

    df['grp_tup'] = f.make_lists_from_columns(df, group_cols,
                                              remove_zero_values=False,
                                              as_tuples=True)
    df['wgt_tup'] = f.make_lists_from_columns(df, weight_cols,
                                              remove_zero_values=False,
                                              as_tuples=False)
    df = df[['basic_job', 'grp_tup', 'wgt_tup',
             'month_start', 'month_end']].copy()
    cols = [col for col in df if col != 'basic_job']
    comb = f.make_lists_from_columns(df, cols)
    df = pd.DataFrame({'job': df.basic_job, 'data': comb})
    settings['ratio_dict'] = f.make_dict_from_columns(df, 'job', 'data')

    # ## count_ratio_dict
    df = xl['ratio_count_capped_cond']

    # make the count ratio condition month range
    month_start = df.month_start.min()
    month_end = df.month_end.max()
    settings['count_ratio_month_range'] = set(range(month_start,
                                                    month_end + 1))

    # make snap_count_on_off_dict
    settings['snap_count_on_off_dict'] = \
        f.make_dict_from_columns(df, 'basic_job', 'snapshot')

    df_cols = df.columns.values.tolist()
    group_cols = [col for col in df_cols if col.startswith('group')]
    weight_cols = [col for col in df_cols if col.startswith('weight')]
    for col in group_cols:
        df[col] = f.make_group_lists(df, col)

    df['grp_tup'] = f.make_lists_from_columns(df, group_cols,
                                              remove_zero_values=False,
                                              as_tuples=True)
    df['wgt_tup'] = f.make_lists_from_columns(df, weight_cols,
                                              remove_zero_values=False,
                                              as_tuples=False)
    df = df[['basic_job', 'grp_tup', 'wgt_tup', 'cap',
             'month_start', 'month_end']].copy()
    cols = [col for col in df if col != 'basic_job']
    comb = f.make_lists_from_columns(df, cols)
    df = pd.DataFrame({'job': df.basic_job, 'data': comb})
    settings['count_ratio_dict'] = f.make_dict_from_columns(df,
                                                            'job', 'data')

    # ## p_dict, p_dict_verbose
    df = xl['proposal_dictionary']
    df.short_descr = df.short_descr.astype(str)
    settings['p_dict'] = f.make_dict_from_columns(df, 'proposal',
                                                  'short_descr')
    settings['p_dict_verbose'] = f.make_dict_from_columns(df, 'proposal',
                                                          'long_descr')

    # gather the condition dictionaries here so they are defined for both
    # paths below (they are converted in place when the enhanced job model
    # is active)
    count_dict = settings['count_ratio_dict']
    ratio_dict = settings['ratio_dict']

    if settings['enhanced_jobs']:

        jd = settings['jd']
        sg_rights = settings['sg_rights']
        # ratio_cond = settings['ratio_cond']
        ratio_onoff = settings['snap_ratio_on_off_dict']
        count_onoff = settings['snap_count_on_off_dict']
        dist_sg = settings['dist_sg']
        dist_ratio = settings['dist_ratio']
        dist_count = settings['dist_count']

        sg_rights, count_dict, ratio_dict, ratio_onoff, count_onoff = \
            cv.convert(job_dict=jd,
                       sg_list=sg_rights,
                       count_ratio_dict=count_dict,
                       ratio_dict=ratio_dict,
                       ratio_onoff_dict=ratio_onoff,
                       count_onoff_dict=count_onoff,
                       dist_sg=dist_sg,
                       dist_ratio=dist_ratio,
                       dist_count_ratio=dist_count)

        settings['sg_rights'] = sg_rights
        settings['snap_ratio_on_off_dict'] = ratio_onoff
        settings['snap_count_on_off_dict'] = count_onoff

    # remove any ratio groups marked with a zero (may only occur with
    # three or more merging groups)
    settings['ratio_dict'] = f.remove_zero_groups(ratio_dict)
    settings['count_ratio_dict'] = f.remove_zero_groups(count_dict)

    snap_ratio_dict = {}
    snap_count_dict = {}
    for job in ratio_dict.keys():
        snap_ratio_dict[job] = ratio_dict[job][2]
    for job in count_dict.keys():
        snap_count_dict[job] = count_dict[job][3]

    settings['snap_ratio_dict'] = snap_ratio_dict
    settings['snap_count_dict'] = snap_count_dict
    # ///////////////////////////////////////////////////////////////////

    # COLOR DICTIONARY~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    color_dict = mcl(num_of_colors=settings['num_of_job_levels'] + 1,
                     return_dict=True)

    if settings['enhanced_jobs']:
        df = xl['enhanced_job_colors']
    else:
        df = xl['basic_job_colors']

    job_colors = f.make_lists_from_columns(df, ['red', 'green',
                                                'blue', 'alpha'])
    color_dict['job_colors'] = job_colors

    # ## eg_colors, lin_reg_colors, lin_reg_colors2, mean_colors
    short_colors = xl['eg_colors']
    settings['egs'] = set(short_colors.eg.values)
    color_dict['eg_color_dict'] = dict(zip(short_colors.eg,
                                           short_colors.eg_colors))
    short_cols = [col for col in list(short_colors) if col != 'eg']
    short_colors = xl['eg_colors'][short_cols]
    for col in list(short_colors):
        color_dict[col] = list(short_colors[col])
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # ATTRIBUTE DICTIONARY >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

    df = xl['attribute_dict']
    attribute_dict = dict(zip(df.col_name, df.col_description))
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
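    # A quick recap of the date arithmetic above, with illustrative values
    # (not from any actual case study): with init_ret_age_years = 65,
    # init_ret_age_months = 0, and retirement age increases totaling 24
    # months, ret_age = 65 + 24 * (1 / 12) = 67.0.  Likewise, a
    # starting_date of 2014-01-31 with an implementation_date of
    # 2016-10-31 gives imp_month = (2016 - 2014) * 12 - (1 - 10) = 33,
    # i.e. the implementation falls in model month 33 (month zero being
    # the first model month).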
    # OLD CODE STARTS HERE: (making pickle files...)
    # *********************

    # MASTER FILE:
    master = pd.read_excel('excel/' + case + '/master.xlsx')

    master.set_index('empkey', drop=False, inplace=True)

    master['retdate'] = master['dob'] + \
        pd.DateOffset(years=settings['init_ret_age_years']) + \
        pd.DateOffset(months=settings['init_ret_age_months'])
    # calculate future retirement age increase(s)
    if settings['ret_age_increase']:
        ret_incr_dict = settings['ret_incr_dict']
        for date, add_months in ret_incr_dict.items():
            master.loc[master.retdate > pd.to_datetime(date) +
                       pd.offsets.MonthEnd(-1), 'retdate'] = \
                master.retdate + pd.DateOffset(months=add_months)

    # only include employees who retire during or after the starting_month
    # (remove employees who retire prior to the analysis period)
    master = master[master.retdate >= start_date -
                    pd.DateOffset(months=1) +
                    pd.DateOffset(days=1)]

    master.to_pickle('dill/master.pkl')

    # ACTIVE EACH MONTH (no consideration for job changes or recall; only
    # calculated on retirements of active employees as of the start date)
    emps_to_calc = master[master.line == 1].copy()
    cmonths = f.career_months(emps_to_calc, settings['starting_date'])

    # LIST ORDER PROPOSALS
    # Read the list ordering proposals from an Excel workbook, add an
    # index column ('idx'), and store each proposal as a dataframe in a
    # pickled file.  The proposals are contained on separate worksheets;
    # the routine below loops through the worksheets.  The worksheet tab
    # names are important for the function: the pickle files are named
    # after the workbook sheet names.
    xl = pd.ExcelFile('excel/' + case + '/proposals.xlsx')

    sheets = xl.sheet_names
    # make a dataframe containing the proposal names and store it
    # (it will be utilized by the load_datasets function)
    sheets_df = pd.DataFrame(sheets, columns=['proposals'])
    sheets_df.to_pickle('dill/proposal_names.pkl')

    for ws in sheets:
        try:
            df = xl.parse(ws)[['empkey']]
            df.set_index('empkey', inplace=True)
            df['idx'] = np.arange(len(df)).astype(int) + 1
            df.to_pickle('dill/p_' + ws + '.pkl')
        except Exception:
            print('proposal worksheet', ws, 'skipped during processing')
            continue

    # LAST MONTH
    # Percent of month worked for all employee retirement dates.
    # Used for retirement month pay.
    df_dates = master[['retdate']].copy()
    df_dates['day_of_month'] = df_dates.retdate.dt.day
    df_dates['days_in_month'] = (df_dates.retdate +
                                 pd.offsets.MonthEnd(0)).dt.day
    df_dates['last_pay'] = df_dates.day_of_month.values / \
        df_dates.days_in_month.values

    df_dates.set_index('retdate', inplace=True)
    df_dates = df_dates[['last_pay']]
    df_dates.sort_index(inplace=True)
    df_dates = df_dates[~df_dates.index.duplicated()]
    df_dates.to_pickle('dill/last_month.pkl')

    # ********************************************************************

    # JOB TABLES AND RELATED DICTIONARY___________________________________
    # create the job tables (standalone and integrated) and store them as
    # a dictionary (along with the job changes and job counts input
    # arrays)

    # JOB_ASSIGN_FILTER_TABLE 1
    master_copy = master[['retdate', 'line', 'fur']].copy()
    # only active employees...
    df_actives = master_copy[master_copy.line == 1]
    # only furloughees...
    df_fur = master_copy[master_copy.fur == 1]

    cmonths = f.career_months(df_actives, settings['starting_date'])
    cmonths_fur = f.career_months(df_fur, settings['starting_date'])

    active_each_month = f.count_per_month(cmonths)
    fur_left_each_month = f.count_per_month(cmonths_fur)
    num_of_months = active_each_month.size
    num_of_job_levels = settings['num_of_job_levels']

    if settings['enhanced_jobs']:
        # use the job dictionary (jd) from the settings dictionary for
        # the conversion
        eg_counts, j_changes = f.convert_to_enhanced(settings['eg_counts'],
                                                     settings['j_changes'],
                                                     settings['jd'])
    else:
        eg_counts = settings['eg_counts']
        j_changes = settings['j_changes']

    # compute the job counts array
    jcnts_arr = f.make_jcnts(eg_counts)

    s_table = f.job_gain_loss_table(num_of_months,
                                    num_of_job_levels,
                                    jcnts_arr,
                                    j_changes,
                                    standalone=True)

    table = f.job_gain_loss_table(num_of_months,
                                  num_of_job_levels,
                                  jcnts_arr,
                                  j_changes,
                                  standalone=False)

    # JOB_ASSIGN_FILTER_TABLE 2
    # this array will contain the number of originally furloughed
    # employees who remain under the retirement age
    fur_arr = np.zeros(num_of_months)
    np.put(fur_arr, np.arange(fur_left_each_month.size),
           fur_left_each_month)

    # this array will hold the cumulative furlough recall counts
    recall_arr = np.zeros(num_of_months)

    # loop through each recall schedule and make an array of cumulative
    # recall counts
    for recall in settings['recalls']:
        recall_add = np.zeros(num_of_months)
        np.put(recall_add, np.arange(recall[2], recall[3]), recall[0])
        np.cumsum(recall_add, out=recall_add)
        # add this recall cumsum to the main recall_arr
        # (for each recall schedule)
        recall_arr = recall_arr + recall_add

    # limit each month's cumulative recall count wherever the monthly
    # count of remaining furloughed employees is less
    additive_arr = np.minimum(fur_arr, recall_arr)

    # add 2 zero columns in front of the job count table
    zero_table = f.add_zero_col(f.add_zero_col(table[0]))

    # create a cumulative table of job counts, left to right, for
    # comparison
    accum_table = np.add.accumulate(zero_table, axis=1)

    # create an employee count limit array to compare with the cumulative
    # job counts
    if settings['recall']:
        limit_arr = (active_each_month + additive_arr).astype(int)
    else:
        limit_arr = active_each_month.astype(int)

    limit_arr = limit_arr[:, None]

    # Perform a truth test on accum_table; False results will cause job
    # loop(s) for a month to be skipped by the
    # assign_standalone_job_changes function.
    loop_check = np.less_equal(accum_table, limit_arr)

    table_dict = {'s_table': s_table,
                  'table': table,
                  'j_changes': j_changes,
                  'jcnts_arr': jcnts_arr,
                  'loop_check': loop_check}
    # ___________________________________________________________________

    # SQUEEZE_VALS ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    # Initial values for the editor tool widgets.  The values stored
    # within this file will be replaced and updated by the editor tool
    # when it is utilized.
    # initial max range for the edit slider
    edit_max = len(df_actives) + len(df_fur)
    x_low = int(.35 * edit_max)
    x_high = int(.65 * edit_max)

    editor_dict = {'base_ds_name': '',
                   'box_line_color': 'black',
                   'box_line_alpha': '.8',
                   'box_fill_color': 'black',
                   'box_fill_alpha': '.05',
                   'box_line_width': '1.0',
                   'case': case,
                   'chk_color_apply': [0],
                   'chk_display': [0],
                   'chk_filter': [1],
                   'chk_hover_sel': [],
                   'chk_hover_on': [],
                   'chk_minor_grid': [],
                   'chk_scatter': True,
                   'chk_poly_fit': False,
                   'chk_trails': [],
                   'chk_mean': False,
                   'chk_sagov': False,
                   'cht_xsize': 1200,
                   'cht_ysize': 580,
                   'cht_xflipped': False,
                   'cht_yflipped': False,
                   'cht_title': 'spcnt',
                   'cht_xformat': '0',
                   'cht_yformat': '0.0%',
                   'edit_max': edit_max,
                   'ez_end': edit_max,
                   'ez_step': 5,
                   'minor_grid_alpha': 0.0,
                   'num_of_months': num_of_months,
                   'p2_marker_alpha': .8,
                   'p2_marker_size': 2.2,
                   'sel_base': 'standalone',
                   'sel_bgc': 'White',
                   'sel_bgc_alpha': '.10',
                   'sel_cond': 'none',
                   'sel_emp_grp': '1',
                   'sel_filt1': '',
                   'sel_filt2': '',
                   'sel_filt3': '',
                   'sel_gridc': 'Gray',
                   'sel_gridc_alpha': '.20',
                   'sel_measure': 'spcnt',
                   'sel_proposal': 'edit',
                   'sel_mth_oper': '>=',
                   'sel_mth_num': '0',
                   'sel_oper1': '==',
                   'sel_oper2': '==',
                   'sel_oper3': '==',
                   'sel_sqz_dir': '<< d',
                   'sel_sqz_type': 'log',
                   'sel_xtype': 'prop_s',
                   'sel_ytype': 'diff',
                   'slider_squeeze': 100,
                   'total_count': edit_max,
                   'txt_input1': '',
                   'txt_input2': '',
                   'txt_input3': '',
                   'x_high': x_high,
                   'x_low': x_low}

    with open('dill/editor_dict.pkl', 'wb') as handle:
        pickle.dump(editor_dict, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    # WRITE DICTIONARIES TO DISK ==========================================
    with open('dill/dict_settings.pkl', 'wb') as handle:
        pickle.dump(settings, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open('dill/dict_color.pkl', 'wb') as handle:
        pickle.dump(color_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open('dill/dict_attr.pkl', 'wb') as handle:
        pickle.dump(attribute_dict, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open('dill/dict_job_tables.pkl', 'wb') as handle:
        pickle.dump(table_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
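# A minimal invocation sketch.  The entry-point guard below is assumed
# (it is not shown in this excerpt), and 'sample3' is only an illustrative
# case study name.  Run from the command line with the case study name as
# the single argument:
#
#     python build_program_files.py sample3

if __name__ == '__main__':
    main()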
# ------------------------------------------------------------------------
# Fragment (a superseded version of the routine in the script below): it
# relies on the older 'cf' config module and on variables (df_proposal,
# orig_jobs, jcnts_arr, stand_path_string) defined elsewhere.

# Insert the stovepipe job result into a new column of the proposal
# (month_form).  This indexes the jobs with empkeys (orig_jobs is an
# ndarray only).
df_proposal['orig_job'] = orig_jobs

# ASSIGN JOBS - flush and no flush option*

# cmonths - career length in months for each employee;
# length is equal to the number of employees
cmonths = f.career_months_df_in(df_proposal)

# nonret_each_month: count of non-retired employees remaining in each
# month until no more remain; length is equal to the longest career length
nonret_each_month = f.count_per_month(cmonths)
all_months = np.sum(nonret_each_month)
cumulative = nonret_each_month.cumsum()
np_low_limits = f.make_lower_slice_limits(cumulative)

job_level_counts = np.array(jcnts_arr[1])

if cf.delayed_implementation:

    imp_month = cf.imp_month
    imp_low = np_low_limits[imp_month]
    imp_high = cumulative[imp_month]

    dstand = pd.read_pickle(stand_path_string)
    ds_option = dstand[['job_count', 'lspcnt', 'spcnt',
                        'rank_in_job', 'jobp']]
    dstand = dstand[['mnum', 'jnum', 'empkey', 'fur']][:imp_high]
import os
from collections import OrderedDict as od
from sys import argv

import numpy as np
import pandas as pd

# project-local helper module (import name assumed from the usage below)
import functions as f


def main():

    script, proposal_name, *conditions = argv

    pre, suf = 'dill/', '.pkl'

    skeleton_path_string = (pre + 'skeleton' + suf)
    proposal_order_string = (pre + 'p_' + proposal_name + suf)
    stand_path_string = (pre + 'standalone' + suf)

    output_name = 'ds_' + proposal_name

    try:
        df_master = pd.read_pickle(pre + 'master' + suf)
    except OSError:
        print('Master list not found.  '
              'Run the build_program_files script?')
        print('\n  >>> exiting routine.\n')
        exit()

    try:
        ds = pd.read_pickle(skeleton_path_string)
    except OSError:
        print('\nSkeleton file not found.  '
              'Run the build_program_files script?\n\n'
              'Dataset build failed.\n\n'
              '  >>> exiting routine.\n')
        exit()

    try:
        df_order = pd.read_pickle(proposal_order_string)
    except OSError:
        prop_names = \
            pd.read_pickle('dill/proposal_names.pkl').proposals.tolist()
        stored_case = pd.read_pickle('dill/case_dill.pkl').case.value
        print('\nerror: proposal name "' + str(proposal_name) +
              '" not found...\n')
        print('available proposal names are', prop_names,
              'for case study:', stored_case)
        print('\n  >>> exiting routine.\n')
        exit()

    sdict = pd.read_pickle('dill/dict_settings.pkl')
    tdict = pd.read_pickle('dill/dict_job_tables.pkl')

    # do not include inactive employees (other than furloughees) in the
    # data model
    df_master = df_master[(df_master.line == 1) |
                          (df_master.fur == 1)].copy()

    num_of_job_levels = sdict['num_of_job_levels']
    lspcnt_calc = sdict['lspcnt_calc_on_remaining_population']

    # ORDER the skeleton df according to the INTEGRATED list order.
    # df_skel can initially be in any integrated order; each employee
    # group must simply be in proper order relative to itself.
    # Use the short-form 'idx' (order) column from either the proposed
    # list or the new_order column from an edited list to create a new
    # column, 'new_order', within the long-form df_skel.  The new order
    # column is created by data alignment using the common empkey
    # indexes.  The skeleton may then be sorted by month and new_order.
    # (Note: duplicate df_skel empkey index values, from different
    # months, are assigned the same order value.)
    if 'edit' in conditions:
        df_new_order = pd.read_pickle(proposal_order_string)
        ds['new_order'] = df_new_order['new_order']
        dataset_path_string = (pre + 'ds_edit' + suf)
    else:
        try:
            order_key = df_order.idx
        except AttributeError:
            order_key = df_order.new_order
        ds['new_order'] = order_key
        dataset_path_string = (pre + output_name + suf)

    if os.path.isdir('dill/'):
        try:
            os.remove(dataset_path_string)
        except OSError:
            pass

    # sort the skeleton by month and proposed list order
    ds.sort_values(['mnum', 'new_order'], inplace=True)

    # ORIG_JOB*

    eg_sequence = df_master.eg.values
    fur_sequence = df_master.fur.values

    # create a list of employee group codes from the master data
    egs = sorted(pd.unique(eg_sequence))
    # retrieve the job counts array
    jcnts_arr = tdict['jcnts_arr']

    if 'prex' in conditions:

        sg_rights = sdict['sg_rights']
        sg_eg_list = []
        sg_dict = od()
        stove_dict = od()

        # find the employee groups which have pre-existing job rights...
        # grab the eg code from each sg (special group) job rights
        # description and add it to sg_eg_list
        for line_item in sg_rights:
            sg_eg_list.append(line_item[0])
        # place the unique eg codes into a sorted list
        sg_eg_list = sorted(pd.unique(sg_eg_list))

        # make a dictionary containing the special group data for each
        # group with special rights
        for eg in sg_eg_list:
            sg_data = []
            for line_item in sg_rights:
                if line_item[0] == eg:
                    sg_data.append(line_item)
            sg_dict[eg] = sg_data

        for eg in egs:
            if eg in sg_eg_list:
                # (run the prex stovepipe routine with the eg dict key
                # and value)
                sg = df_master[df_master.eg == eg]['sg'].values
                fur = df_master[df_master.eg == eg]['fur']
                ojob_array = f.make_stovepipe_prex_shortform(
                    jcnts_arr[0][eg - 1], sg, sg_dict[eg], fur)
                prex_stove = np.take(ojob_array, np.where(fur == 0)[0])
                stove_dict[eg] = prex_stove
            else:
                # (run the make_stovepipe routine with the eg dict key
                # and value)
                stove_dict[eg] = f.make_stovepipe_jobs_from_jobs_arr(
                    jcnts_arr[0][eg - 1])

        # use the dict values as inputs to sp_arr;
        # the ordered dict maintains the proper sequence...
        sp_arr = list(np.array(list(stove_dict.values())))
        # total of jobs per eg
        eg_job_counts = np.add.reduce(jcnts_arr[0], axis=1)

        orig_jobs = f.make_intgrtd_from_sep_stove_lists(sp_arr,
                                                        eg_sequence,
                                                        fur_sequence,
                                                        eg_job_counts,
                                                        num_of_job_levels)
    else:

        orig_jobs = f.make_original_jobs_from_counts(
            jcnts_arr[0], eg_sequence, fur_sequence,
            num_of_job_levels).astype(int)

    # Insert the stovepipe job result into a new column of the proposal
    # (month_form).  This indexes the jobs with empkeys (orig_jobs is an
    # ndarray only).
    df_master['orig_job'] = orig_jobs

    # ASSIGN JOBS - flush and no flush option*

    # cmonths - career length in months for each employee;
    # length is equal to the number of employees
    cmonths = f.career_months(df_master, sdict['starting_date'])

    # nonret_each_month: count of non-retired employees remaining in each
    # month until no more remain; length is equal to the longest career
    # length
    nonret_each_month = f.count_per_month(cmonths)
    all_months = np.sum(nonret_each_month)
    high_limits = nonret_each_month.cumsum()
    low_limits = f.make_lower_slice_limits(high_limits)

    if sdict['delayed_implementation']:

        imp_month = sdict['imp_month']
        imp_low = low_limits[imp_month]
        imp_high = high_limits[imp_month]

        # read the standalone dataset (info is not in integrated order)
        ds_stand = pd.read_pickle(stand_path_string)

        # Get the standalone data and order it the same as the integrated
        # dataset: create a unique key column in the standalone data df
        # and a temporary df which is ordered according to the integrated
        # dataset.
        imp_cols, arr_dict, col_array = \
            f.make_preimp_array(ds_stand, ds,
                                imp_high,
                                sdict['compute_job_category_order'],
                                sdict['compute_pay_measures'])

        # Select columns to use as pre-implementation data for the
        # integrated dataset; the data is limited to the
        # pre-implementation months.
        # The aligned_jnums and aligned_fur arrays are the same as the
        # standalone data up to the end of the implementation month; the
        # standalone value for the implementation month is then passed
        # down unchanged for the remainder of the months in the model.
        # These arrays carry over standalone data for each employee
        # group, to be honored until the integrated list is implemented.
        # The values from the standalone datasets (furlough status and
        # standalone job held at the implementation date) are needed for
        # subsequent integrated dataset job assignment calculations.
        # Other standalone values are simply copied and inserted into the
        # pre-implementation months of the integrated dataset.
        delayed_jnums = col_array[arr_dict['jnum']]
        delayed_fur = col_array[arr_dict['fur']]

        aligned_jnums = f.align_fill_down(imp_low,
                                          imp_high,
                                          ds[[]],  # indexed with empkeys
                                          delayed_jnums)

        aligned_fur = f.align_fill_down(imp_low,
                                        imp_high,
                                        ds[[]],
                                        delayed_fur)

        # now assign the "filled-down" job numbers to the numpy arrays
        delayed_jnums[imp_low:] = aligned_jnums[imp_low:]
        delayed_fur[imp_low:] = aligned_fur[imp_low:]

        # ORIG_JOB and FUR (delayed implementation)
        # assign the numpy array values to the orig_job column of the
        # integrated dataset as the starting point for integrated job
        # assignments
        ds['orig_job'] = delayed_jnums
        ds['fur'] = delayed_fur

        if sdict['integrated_counts_preimp']:
            # Assign combined job counts prior to the implementation
            # date.  (Otherwise, separate employee group counts will be
            # used when the data is transferred from col_array at the end
            # of the script.)
            # NOTE: this data is the actual number of jobs held within
            # each category; it could be less than the number of jobs
            # available as attrition occurs.
            standalone_preimp_job_counts = \
                f.make_delayed_job_counts(imp_month,
                                          delayed_jnums,
                                          low_limits,
                                          high_limits)
            col_array[arr_dict['job_count']][:imp_high] = \
                standalone_preimp_job_counts

    else:
        # set the implementation month to zero for the job assignment
        # routine
        imp_month = 0

        # ORIG_JOB and FUR (no delayed implementation)
        # transfer the proposal stovepipe jobs (month_form) to long_form
        # via index (empkey) alignment...
        ds['orig_job'] = df_master['orig_job']
        # developer note: test to verify this is not instantiated
        # elsewhere...
        ds['fur'] = df_master['fur']

    table = tdict['table']
    j_changes = tdict['j_changes']

    reduction_months = f.get_job_reduction_months(j_changes)
    # Copy selected columns from ds for the job assignment function input
    # below.  Note: if delayed implementation, the 'fur' and 'orig_job'
    # columns contain standalone data through the implementation month.
    df_align = ds[['eg', 'sg', 'fur', 'orig_job']].copy()

    # JNUM, FUR, JOB_COUNT
    if sdict['no_bump']:

        # No bump, no flush option (includes conditions, furlough/recall,
        # and job changes schedules).
        # This is the main job assignment function; it loops through all
        # of the months in the model and assigns jobs.
        nbnf, job_count, fur = \
            f.assign_jobs_nbnf_job_changes(df_align,
                                           low_limits,
                                           high_limits,
                                           all_months,
                                           reduction_months,
                                           imp_month,
                                           conditions,
                                           sdict,
                                           tdict,
                                           fur_return=sdict['recall'])

        ds['jnum'] = nbnf
        ds['job_count'] = job_count
        ds['fur'] = fur
        # for create_snum_and_spcnt_arrays function input...
        jnum_jobs = nbnf

    else:

        # Full flush and bump option (no conditions or furlough/recall
        # schedule considered; job changes are included).
        # No bump, no flush is applied up to the implementation date.
        fbff, job_count, fur = f.assign_jobs_full_flush_job_changes(
            nonret_each_month, table[0], num_of_job_levels)

        ds['jnum'] = fbff
        ds['job_count'] = job_count
        ds['fur'] = fur
        # for create_snum_and_spcnt_arrays function input...
        jnum_jobs = fbff

    # SNUM, SPCNT, LNUM, LSPCNT

    monthly_job_counts = table[1]
    ds['snum'], ds['spcnt'], ds['lnum'], ds['lspcnt'] = \
        f.create_snum_and_spcnt_arrays(jnum_jobs, num_of_job_levels,
                                       nonret_each_month,
                                       monthly_job_counts,
                                       lspcnt_calc)

    # RANK in JOB
    ds['rank_in_job'] = ds.groupby(['mnum', 'jnum'],
                                   sort=False).cumcount() + 1

    # JOBP
    # make the last percentage position in each job category .99999
    # instead of 1.0 so that the jobp calculations are correct
    jpcnt = (ds.rank_in_job / ds.job_count).values
    np.put(jpcnt, np.where(jpcnt == 1.0)[0], .99999)

    ds['jobp'] = ds['jnum'] + jpcnt

    # PAY - merge with the pay table - provides monthly pay
    if sdict['compute_pay_measures']:

        # account for furlough time (only count active months)
        if sdict['discount_longev_for_fur']:
            # skel(ds) provides pre-calculated non-discounted scale data
            # flip the ones and zeros...
            ds['non_fur'] = 1 - ds.fur.values

            non_fur = ds.groupby([pd.Grouper('empkey')])['non_fur'] \
                .cumsum().values
            ds.pop('non_fur')
            starting_mlong = ds.s_lmonths.values
            cum_active_months = non_fur + starting_mlong

            ds['mlong'] = cum_active_months
            ds['ylong'] = ds['mlong'].values / 12
            ds['scale'] = np.clip((cum_active_months / 12) + 1, 1,
                                  sdict['top_of_scale']).astype(int)

        # Make a new long_form dataframe and assign a combination of
        # pay-related ds columns from the large dataset as its index...
        # The dataframe is empty - we are only making an index-alignment
        # vehicle to use with the indexed pay table.
        # The dataframe index contains the specific scale, job, and
        # contract year for each line in the long_form ds.
        df_pt_index = pd.DataFrame(index=((ds['scale'].values * 100) +
                                          ds['jnum'].values +
                                          (ds['year'].values * 100000)))

        if sdict['enhanced_jobs']:
            df_pt = pd.read_pickle('dill/pay_table_enhanced.pkl')
        else:
            df_pt = pd.read_pickle('dill/pay_table_basic.pkl')

        # 'data-align' the small indexed pay_table to the long_form df:
        df_pt_index['monthly'] = df_pt['monthly']

        ds['monthly'] = df_pt_index.monthly.values

        # MPAY
        # adjust the monthly pay for any pay raise and the last month pay
        # percent, if applicable
        ds['mpay'] = ((ds['pay_raise'].values *
                       ds['mth_pcnt'].values *
                       ds['monthly'].values)) / 1000

        ds.pop('monthly')

        # CPAY
        ds['cpay'] = ds.groupby('new_order')['mpay'].cumsum()

    if sdict['delayed_implementation']:

        ds_cols = ds.columns
        # Grab each imp_col (column to insert standalone or
        # pre-implementation date data) and replace the integrated data
        # up through the implementation date.
        for col in imp_cols:
            if col in ds_cols:
                arr = ds[col].values
                arr[:imp_high] = col_array[arr_dict[col]][:imp_high]
                ds[col] = arr

    # CAT_ORDER
    # global job ranking
    if sdict['compute_job_category_order']:
        ds['cat_order'] = f.make_cat_order(ds, table[0])

    # save to file
    if sdict['save_to_pickle']:
        ds.to_pickle(dataset_path_string)
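# A minimal invocation sketch.  The entry-point guard below is assumed
# (it is not shown in this excerpt); the script file name is not given in
# this excerpt, and 'p1'/'prex' are only illustrative argument values.
# Run with a proposal name plus any optional condition keywords
# ('prex', 'edit'):
#
#     python <dataset_script>.py p1 prex

if __name__ == '__main__':
    main()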
# ------------------------------------------------------------------------
# Fragment (a superseded version of the standalone loop below): it relies
# on pre-built ds1/ds2/ds3 dataframes, the older 'cf' config module, and
# variables (jcnts_arr, table) defined elsewhere.

ds_list = [ds1, ds2, ds3]
short_ds_list = [short_ds1, short_ds2, short_ds3]

for i in range(len(ds_list)):

    df_long = ds_list[i]
    df_short = short_ds_list[i]
    jcnts = jcnts_arr[0][i]
    # jcnts = np.take(jcnts, np.where(jcnts != 0)[0])
    short_len = len(short_ds_list[i])

    # ORIG_JOB*
    cmonths_this_ds = f.career_months_df_in(df_short)
    this_ds_nonret_each_month = f.count_per_month(cmonths_this_ds)
    uppers = this_ds_nonret_each_month.cumsum()
    lowers = f.make_lower_slice_limits(uppers)
    all_months = np.sum(this_ds_nonret_each_month)

    this_table = table[0][i]
    this_month_counts = table[1][i]

    df_align = df_long[['twa', 'fur']]
    fur_codes = np.array(df_align.fur)

    # if i == 0 and cf.apply_supc:  # i == 0 >> eg1 from skeleton
    #     twa_rights = np.array(cf.twa_rights)
    #     twa_jobs = np.transpose(twa_rights)[1]
    #     sup_c_counts = np.transpose(twa_rights)[2]
    #     twa_dict = dict(zip(twa_jobs, sup_c_counts))
import os
from sys import argv

import numpy as np
import pandas as pd

# project-local helper module (import name assumed from the usage below)
import functions as f


def main():

    script, *conditions = argv

    input_skel = 'skeleton'

    pre, suf = 'dill/', '.pkl'

    skeleton_path_string = (pre + input_skel + suf)

    try:
        ds = pd.read_pickle(skeleton_path_string)
    except OSError:
        print('\nSkeleton file not found.  '
              'Run the build_program_files script?\n\n'
              'Standalone build failed.\n\n'
              '  >>> exiting routine.\n')
        exit()

    if os.path.isdir('dill/'):
        try:
            os.remove('dill/standalone.pkl')
        except OSError:
            pass

    sdict = pd.read_pickle('dill/dict_settings.pkl')
    tdict = pd.read_pickle('dill/dict_job_tables.pkl')

    num_of_job_levels = sdict['num_of_job_levels']

    egs = np.unique(ds.eg)
    start_month = 0

    # make prex True or False
    # (for input to the assign_standalone_job_changes function)
    prex = 'prex' in conditions

    table = tdict['s_table']
    jcnts_arr = tdict['jcnts_arr']
    j_changes = tdict['j_changes']

    job_change_months = f.get_job_change_months(j_changes)
    job_reduction_months = f.get_job_reduction_months(j_changes)

    # sort the skeleton by employee group, month, and index
    # (preserves each group's list order)
    ds.sort_values(['eg', 'mnum', 'idx'], inplace=True)

    ds_dict = {}
    short_ds_dict = {}

    for grp in egs:
        ds_dict[grp] = ds[ds.eg == grp].copy()

    for grp in egs:
        short_ds_dict[grp] = ds_dict[grp][ds_dict[grp].mnum == 0].copy()

    ds = pd.DataFrame()

    for eg in egs:

        df_long = ds_dict[eg]
        df_short = short_ds_dict[eg]
        jcnts = jcnts_arr[0][eg - 1]
        short_len = len(df_short)

        # ORIG_JOB*
        cmonths_this_ds = \
            f.career_months(df_short, sdict['starting_date'])
        this_ds_nonret_each_month = f.count_per_month(cmonths_this_ds)
        high_limits = this_ds_nonret_each_month.cumsum()
        low_limits = f.make_lower_slice_limits(high_limits)
        all_months = np.sum(this_ds_nonret_each_month)

        this_eg_table = f.add_zero_col(table[0][eg - 1])
        this_eg_month_counts = table[1][eg - 1]

        df_align_cols = ['fur']
        if 'sg' in df_long:
            df_align_cols.append('sg')

        df_align = df_long[df_align_cols]

        # pre-existing employee group special job assignment is handled
        # within the job assignment function below...
        results = f.assign_standalone_job_changes(eg,
                                                  df_align,
                                                  low_limits,
                                                  high_limits,
                                                  all_months,
                                                  this_eg_table,
                                                  this_eg_month_counts,
                                                  this_ds_nonret_each_month,
                                                  job_change_months,
                                                  job_reduction_months,
                                                  start_month,
                                                  sdict,
                                                  tdict,
                                                  apply_sg_cond=prex)

        jnums = results[0]
        count_col = results[1]
        held = results[2]
        fur = results[3]
        orig_jobs = results[4]

        # HELD JOB
        # job from the previous month
        df_long['held'] = held

        # JOB_COUNT
        df_long['job_count'] = count_col

        # ORIG_JOB
        df_short['orig_job'] = orig_jobs
        df_long['orig_job'] = df_short['orig_job']

        # ASSIGN JOBS - (stovepipe method only, since we are assigning
        # within each employee group separately)

        # JNUM
        df_long['jnum'] = jnums

        # SNUM, SPCNT, LNUM, LSPCNT
        monthly_job_counts = table[1][eg - 1]
        lspcnt_calc = sdict['lspcnt_calc_on_remaining_population']

        df_long['snum'], df_long['spcnt'], \
            df_long['lnum'], df_long['lspcnt'] = \
            f.create_snum_and_spcnt_arrays(jnums, num_of_job_levels,
                                           this_ds_nonret_each_month,
                                           monthly_job_counts,
                                           lspcnt_calc)

        # RANK in JOB
        df_long['rank_in_job'] = \
            df_long.groupby(['mnum', 'jnum']).cumcount() + 1

        # JOBP
        # make the last percentage position in each job category .99999
        # instead of 1.0 so that the jobp calculations are correct
        jpcnt = (df_long.rank_in_job / df_long.job_count).values
        np.put(jpcnt, np.where(jpcnt == 1.0)[0], .99999)

        df_long['jobp'] = df_long['jnum'] + jpcnt

        # PAY - merge with the pay table - provides monthly pay
        if sdict['compute_pay_measures']:

            if sdict['discount_longev_for_fur']:
                # skel provides non-discounted scale data
                # flip the ones and zeros...
                df_long['non_fur'] = 1 - fur
                df_long['fur'] = fur

                non_fur = \
                    (df_long.groupby([pd.Grouper('empkey')])
                     ['non_fur'].cumsum().values)
                df_long.pop('non_fur')
                starting_mlong = df_long.s_lmonths.values
                cum_active_months = non_fur + starting_mlong

                df_long['mlong'] = cum_active_months
                df_long['ylong'] = df_long['mlong'] / 12
                df_long['scale'] = \
                    np.clip((cum_active_months / 12) + 1, 1,
                            sdict['top_of_scale']).astype(int)

            # SCALE
            df_pt_index = pd.DataFrame(
                index=(df_long['scale'] * 100) + df_long['jnum'] +
                (df_long['year'] * 100000))

            if sdict['enhanced_jobs']:
                df_pt = pd.read_pickle('dill/pay_table_enhanced.pkl')
            else:
                df_pt = pd.read_pickle('dill/pay_table_basic.pkl')

            df_pt_index['monthly'] = df_pt['monthly']

            df_long['monthly'] = df_pt_index.monthly.values

            # MPAY
            # adjust the monthly pay for any pay raise and the last month
            # pay percent, if applicable
            df_long['mpay'] = ((df_long['pay_raise'] *
                                df_long['mth_pcnt'] *
                                df_long['monthly'])) / 1000

            df_long.pop('monthly')

            # CPAY
            df_long['cpay'] = df_long.groupby('idx')['mpay'].cumsum()

        ds = pd.concat([ds, df_long], ignore_index=True)

    ds.sort_values(by=['mnum', 'idx'], inplace=True)
    ds.set_index('empkey', drop=False, verify_integrity=False,
                 inplace=True)

    # CAT_ORDER
    # global job ranking
    if sdict['compute_job_category_order']:
        table = tdict['table']
        ds['cat_order'] = f.make_cat_order(ds, table[0])

    # save to file
    if sdict['save_to_pickle']:
        ds.to_pickle('dill/standalone.pkl')
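# A minimal invocation sketch.  The entry-point guard below is assumed
# (it is not shown in this excerpt); the standalone script is run with
# any optional condition keywords, e.g. 'prex':
#
#     python <standalone_script>.py prex

if __name__ == '__main__':
    main()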
import numpy as np
import pandas as pd

# project-local helper module (import name assumed from the usage below)
import functions as f


def main():

    # Read the prepared list dataframe - proper column headers, column
    # formats...  This is master.pkl: order-independent, concatenated
    # list data.
    pre, suf = 'dill/', '.pkl'

    master_list = 'master'
    master_path = (pre + master_list + suf)

    try:
        df_mlist = pd.read_pickle(master_path)
    except OSError:
        print('\nMaster list not found.  '
              'Run the build_program_files script?\n\n'
              'Skeleton build failed.\n\n'
              '  >>> exiting routine.\n')
        import sys
        sys.exit()

    output_name = 'skeleton'
    skel_path_string = (pre + output_name + suf)

    sdict = pd.read_pickle('dill/dict_settings.pkl')

    # only include employees who are not retired prior to the
    # starting_month
    start_date = sdict['starting_date']
    df_mlist = df_mlist[df_mlist.retdate >= start_date -
                        pd.DateOffset(months=1)]

    # include furloughees by default
    df = df_mlist[(df_mlist.line == 1) | (df_mlist.fur == 1)].copy()

    df_mlist = []

    # MNUM*
    # Calculate the number of career months for each employee
    # (short_form).  cmonths is used for the mnum, idx, and mth_pcnt
    # calculations.
    cmonths = f.career_months(df, start_date)

    # Use the cmonths array as input for the count_per_month function.
    # The count_per_month output array is input for other functions
    # (month_form).
    nonret_each_month = f.count_per_month(cmonths)

    # First long-form data generation: month numbers, with the same month
    # number repeated for each month length (long_form).
    long_form_skeleton = f.gen_month_skeleton(nonret_each_month)

    # MNUM
    # (month number)
    # Make a dataframe out of the long_form_skeleton (months) created
    # above; this is the basis for the long_form dataframe...
    skel = pd.DataFrame(long_form_skeleton.astype(int), columns=['mnum'])

    # IDX*
    # grab the employee index for each remaining employee for each month -
    # used for merging dfs later
    empkey_arr = df.empkey.values

    long_index, long_emp = f.gen_skel_emp_idx(nonret_each_month,
                                              cmonths, empkey_arr)

    # IDX
    skel['idx'] = long_index.astype(int)

    # EMPKEY
    skel['empkey'] = long_emp.astype(int)

    # Grab the retdates from the df column (short_form); used for the
    # mth_pcnt and age calculations (also for mapping retdates).
    dobs = list(df['dob'])

    df_last = pd.read_pickle('dill/last_month.pkl')

    df.set_index('retdate', inplace=True)
    df['lmonth_pcnt'] = df_last.last_pay
    df.reset_index(inplace=True)
    df.set_index('empkey', inplace=True, verify_integrity=False,
                 drop=False)

    lmonth_pcnt = df.lmonth_pcnt.values

    df_dict = {'mth_pcnt': lmonth_pcnt, 'final_month': cmonths}

    df_last_month = pd.DataFrame(df_dict)

    df_last_month['idx'] = df_last_month.index

    df_last_month.set_index(['idx', 'final_month'], inplace=True)

    skel = pd.merge(skel, df_last_month, right_index=True,
                    left_on=['idx', 'mnum'], how='outer')

    # MTH_PCNT
    skel['mth_pcnt'] = skel.mth_pcnt.fillna(1)

    # DATE, YEAR, PAY RAISE*
    # set up the date_range - end of month dates
    df_dates = pd.DataFrame(pd.date_range(start_date,
                                          periods=len(nonret_each_month),
                                          freq='M'),
                            columns=['date'])

    # This function produces a 2-column array: the first column is the
    # year value of the date list passed as input; the second column is
    # either 1.0 or a calculated percentage pay raise after the last
    # contract year.
    if sdict['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, sdict)

    # DATE, YEAR, PAY RAISE
    # the merge below brings in 3 columns - date, year, and pay_raise -
    # from month_form to long_form
    skel = pd.merge(skel, df_dates, right_index=True, left_on=['mnum'])

    # AGE, SCALE*
    # calculate and assign the starting age and starting longevity
    # Assign these to columns in df and then data-align merge them into
    # the skeleton df.  These columns are used later for the age and
    # scale calculations; they are merged here so that they can be done
    # together after setting the indexes to match.

    s_age = f.starting_age(dobs, start_date)
    df['s_age'] = s_age

    # data alignment magic...set the index to empkey
    skel.set_index('empkey', inplace=True, verify_integrity=False,
                   drop=False)

    # AGE, RETDATE, EG, DOH, LDATE, LNAME, FUR, RET_MONTH
    # to the long_form skeleton
    skel['s_age'] = df.s_age
    skel['fur'] = df.fur

    if sdict['add_eg_col']:
        skel['eg'] = df.eg
    if sdict['add_retdate_col']:
        skel['retdate'] = df.retdate
    if sdict['add_doh_col']:
        skel['doh'] = df.doh
    if sdict['add_ldate_col']:
        skel['ldate'] = df.ldate
    if sdict['add_lname_col']:
        skel['lname'] = df.lname
    if sdict['add_line_col']:
        skel['line'] = df.line
    if sdict['add_sg_col']:
        skel['sg'] = df.sg

    # RET_MARK
    # add the last month number to df
    df['ret_month'] = cmonths
    # data-align to the long-form skel
    skel['ret_mark'] = df.ret_month
    mnums = skel.mnum.values
    lmonth_arr = np.zeros(mnums.size).astype(int)
    ret_month = skel.ret_mark.values
    # mark the array where the retirement month is equal to the month
    # number
    np.put(lmonth_arr, np.where(ret_month == mnums)[0], 1)
    skel['ret_mark'] = lmonth_arr

    # SCALE*
    if sdict['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df.s_lyears

        month_inc = (1 / 12)

        # Scale is the pay rate longevity level.  Compute the scale for
        # each employee for each month: begin with s_lyears (starting
        # longevity years), add a monthly increment based on the month
        # number (mnum), convert to an integer (which rounds toward
        # zero), and clip to a minimum of 1 and a maximum of top_of_scale
        # (the maximum pay longevity scale).
        skel['scale'] = np.clip(((skel['mnum'] * month_inc) +
                                 skel['s_lyears']).astype(int),
                                1, sdict['top_of_scale'])
        skel.pop('s_lyears')

        # This column is only used for calculating furloughed employee
        # pay longevity in the compute_measures routine.
        # ...it could be an option if recalls are not part of the model.
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   sdict['starting_date'],
                                                   return_as_months=True)
        skel['s_lmonths'] = df.s_lmonths

    # AGE
    # calculate the monthly age using the starting age and the month
    # number
    age_list = skel.s_age.values

    corr_ages = f.age_correction(long_form_skeleton,
                                 age_list,
                                 sdict['ret_age'])

    if sdict['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(sdict['ret_incr_dict'],
                                      sdict['init_ret_age'],
                                      skel.date.values, corr_ages)
    else:
        skel['age'] = corr_ages

    skel.pop('s_age')

    # empkey index (the empkey column is kept); this allows easy data
    # alignment with different list order keys

    # save the results to a pickle file
    if sdict['save_to_pickle']:
        skel.to_pickle(skel_path_string)
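# A minimal invocation sketch.  The entry-point guard below is assumed
# (it is not shown in this excerpt); the skeleton script takes no
# arguments beyond the script name - it reads 'dill/master.pkl' and
# 'dill/dict_settings.pkl' and writes 'dill/skeleton.pkl':
#
#     python <skeleton_script>.py

if __name__ == '__main__':
    main()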
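
# A quick usage sketch for inspecting the standalone dataset written by
# the routine above (assumes the script has been run so that
# 'dill/standalone.pkl' exists; the column selection is illustrative):

# import pandas as pd
# ds = pd.read_pickle('dill/standalone.pkl')
# ds[['mnum', 'eg', 'jnum', 'snum', 'spcnt', 'jobp']].head()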
def main():

    # read the prepared list dataframe - proper column headers, column
    # formats... this is master.pkl, order-independent, concatenated
    # list data
    pre, suf = 'dill/', '.pkl'
    master_list = 'master'
    master_path = (pre + master_list + suf)

    try:
        df_mlist = pd.read_pickle(master_path)
    except OSError:
        print('\nMaster list not found. ' +
              'Run build_program_files script?\n\n' +
              'Skeleton build failed.\n\n' +
              '  >>> exiting routine.\n')
        import sys
        sys.exit()

    output_name = 'skeleton'
    skel_path_string = (pre + output_name + suf)

    sdict = pd.read_pickle('dill/dict_settings.pkl')

    # only include pilots that are not retired prior to the starting month
    start_date = sdict['starting_date']
    df_mlist = df_mlist[
        df_mlist.retdate >= start_date - pd.DateOffset(months=1)]

    # include furloughees by default
    df = df_mlist[(df_mlist.line == 1) | (df_mlist.fur == 1)].copy()
    # release the master list dataframe reference
    df_mlist = []

    # MNUM*
    # calculate the number of career months for each employee (short_form)
    # cmonths is used for mnum, idx, and mth_pcnt calculations
    cmonths = f.career_months(df, start_date)

    # convert the python cmonths list to a numpy array and
    # use that array as input for the count_per_month function.
    # The count_per_month function output array is input for
    # other functions (month_form)
    nonret_each_month = f.count_per_month(cmonths)

    # first long form data generation.
    # month numbers, with each month number repeated once per
    # remaining employee that month (long_form)
    long_form_skeleton = f.gen_month_skeleton(nonret_each_month)

    # make a dataframe out of the long_form_skeleton (months)
    # created above.
    # this is the basis for the long_form dataframe...

    # MNUM
    # (month number)
    skel = pd.DataFrame(long_form_skeleton.astype(int), columns=['mnum'])

    # IDX*
    # grab the employee index for each remaining
    # employee for each month - used for merging dfs later
    empkey_arr = df.empkey.values

    long_index, long_emp = f.gen_skel_emp_idx(nonret_each_month,
                                              cmonths,
                                              empkey_arr)

    # IDX
    skel['idx'] = long_index.astype(int)

    # EMPKEY
    skel['empkey'] = long_emp.astype(int)

    # grab dobs and retdates from df columns (short_form).
    # dobs are used for the age calc; retdates are used to map the
    # last month pay percentage (mth_pcnt)
    dobs = list(df['dob'])

    df_last = pd.read_pickle('dill/last_month.pkl')

    df.set_index('retdate', inplace=True)
    df['lmonth_pcnt'] = df_last.last_pay
    df.reset_index(inplace=True)
    df.set_index('empkey', inplace=True, verify_integrity=False, drop=False)

    lmonth_pcnt = df.lmonth_pcnt.values

    df_dict = {'mth_pcnt': lmonth_pcnt, 'final_month': cmonths}

    df_last_month = pd.DataFrame(df_dict)

    df_last_month['idx'] = df_last_month.index

    df_last_month.set_index(['idx', 'final_month'], inplace=True)

    skel = pd.merge(skel, df_last_month, right_index=True,
                    left_on=['idx', 'mnum'], how='outer')

    # MTH_PCNT
    skel['mth_pcnt'] = skel.mth_pcnt.fillna(1)

    # DATE, YEAR, PAY RAISE*
    # set up date_range - end of month dates
    df_dates = pd.DataFrame(pd.date_range(start_date,
                                          periods=len(nonret_each_month),
                                          freq='M'),
                            columns=['date'])

    # this function produces a 2-column array.
    # The first column is the year value of each date in the input
    # date list.
    # The second column is either 1.0 or a calculated percentage pay
    # raise after the last contract year.
    if sdict['compute_pay_measures']:
        df_dates = f.contract_year_and_raise(df_dates, sdict)

    # the merge below brings in 3 columns - date, year, and pay_raise
    # - from month_form to long_form

    # DATE, YEAR, PAY RAISE
    skel = pd.merge(skel, df_dates, right_index=True, left_on=['mnum'])

    # AGE, SCALE*
    # calculate and assign starting age and
    # starting longevity.
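    # (illustration of the data alignment used below, with hypothetical
    # values: once both df and skel are indexed by empkey, an assignment
    # such as skel['s_age'] = df.s_age matches rows by index value rather
    # than by position, so each employee's single short-form value is
    # broadcast to every one of that employee's long-form monthly rows.)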
    # Assign to columns in df and then data align merge into skeleton df.
    # These columns are used later for age and scale calculations.
    # Merged here so that they can be done together
    # after setting the indexes to match.
    s_age = f.starting_age(dobs, start_date)
    df['s_age'] = s_age

    # data alignment magic...set index to empkey
    skel.set_index('empkey', inplace=True, verify_integrity=False,
                   drop=False)

    # AGE, RETDATE, EG, DOH, LDATE, LNAME,
    # FUR, RET_MONTH to long_form skeleton
    skel['s_age'] = df.s_age
    skel['fur'] = df.fur

    if sdict['add_eg_col']:
        skel['eg'] = df.eg
    if sdict['add_retdate_col']:
        skel['retdate'] = df.retdate
    if sdict['add_doh_col']:
        skel['doh'] = df.doh
    if sdict['add_ldate_col']:
        skel['ldate'] = df.ldate
    if sdict['add_lname_col']:
        skel['lname'] = df.lname
    if sdict['add_line_col']:
        skel['line'] = df.line
    if sdict['add_sg_col']:
        skel['sg'] = df.sg

    # RET_MARK
    # add the last month number to df
    df['ret_month'] = cmonths
    # data align to the long-form skel
    skel['ret_mark'] = df.ret_month
    mnums = skel.mnum.values
    lmonth_arr = np.zeros(mnums.size).astype(int)
    ret_month = skel.ret_mark.values
    # mark the array where the retirement month equals the month number
    np.put(lmonth_arr, np.where(ret_month == mnums)[0], 1)
    skel['ret_mark'] = lmonth_arr

    # SCALE*
    if sdict['compute_pay_measures']:

        df['s_lyears'] = f.longevity_at_startdate(list(df['ldate']),
                                                  start_date)
        skel['s_lyears'] = df.s_lyears

        month_inc = (1 / 12)

        # scale is the pay rate longevity level.
        # compute the scale for each employee for each month:
        # begin with s_lyears (starting longevity years),
        # add a monthly increment based on the month number (mnum),
        # convert to an integer (rounds toward zero), and
        # clip to a minimum of 1 and a maximum of top_of_scale
        # (the maximum pay longevity scale)
        skel['scale'] = np.clip(
            ((skel['mnum'] * month_inc) + skel['s_lyears']).astype(int),
            1,
            sdict['top_of_scale'])
        skel.pop('s_lyears')

        # this column is only used for calculating furloughed employee
        # pay longevity in the compute_measures routine.
        # ...could be made an option if recalls are not part of the model
        df['s_lmonths'] = f.longevity_at_startdate(list(df['ldate']),
                                                   sdict['starting_date'],
                                                   return_as_months=True)
        skel['s_lmonths'] = df.s_lmonths

    # AGE
    # calculate monthly age using starting age and month number
    age_list = skel.s_age.values

    corr_ages = f.age_correction(long_form_skeleton,
                                 age_list,
                                 sdict['ret_age'])

    if sdict['ret_age_increase']:
        skel['age'] = f.clip_ret_ages(sdict['ret_incr_dict'],
                                      sdict['init_ret_age'],
                                      skel.date.values, corr_ages)
    else:
        skel['age'] = corr_ages

    skel.pop('s_age')

    # keep the empkey index (and retain the empkey column).
    # this allows easy data alignment with different list order keys

    # save results to pickle
    if sdict['save_to_pickle']:
        skel.to_pickle(skel_path_string)
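
# A minimal numeric sketch of the scale progression computed in main()
# above, using hypothetical values (assumes a top_of_scale of 12 and a
# starting longevity of 2.5 years):

def _scale_sketch():
    '''illustrative only - monthly pay scale (longevity step) progression'''
    mnum = np.arange(36)  # first three model years of month numbers
    s_lyears = 2.5        # starting longevity in years
    scale = np.clip((mnum * (1 / 12) + s_lyears).astype(int), 1, 12)
    # the scale steps from 2 to 3 at mnum 6 (2.5 + 6/12 = 3.0) and from
    # 3 to 4 at mnum 18 - one step per year of combined longevity
    return scale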