def make_inter_level_hierarchical_transition_matrixes_tables(person_year_table, profession, out_dir):
    """
    This function spits out two .csv's per profession: one CSV contains the transition count matrices for all
    observed years except the left and right censor years (since we judge entry and departure by comparing the
    focal year X with the years immediately before and after it), and the other contains the transition
    PROBABILITY matrices for the same years.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param out_dir: str, the path to where the transition matrices will live
    :return: None
    """
    global_trans_dict = inter_level_transition_matrices(person_year_table, profession)

    with open(out_dir + 'yearly_count_hierarchical_transition_matrices.csv', 'w') as out_ct, \
            open(out_dir + 'yearly_count_hierarchical_probability_transition_matrices.csv', 'w') as out_pb:
        count_writer, prob_writer = csv.writer(out_ct), csv.writer(out_pb)
        count_writer.writerow([profession]), count_writer.writerow([])
        prob_writer.writerow([profession]), prob_writer.writerow([])

        for yr in global_trans_dict:
            count_writer.writerow([yr]), prob_writer.writerow([yr])

            for lvl in global_trans_dict[yr]:
                count_row = [str(key) + ' : ' + str(value) for key, value in global_trans_dict[yr][lvl].items()]
                count_writer.writerow(count_row)

                level_sum_key = str(lvl) + '-' + "level_sum"
                level_sum = global_trans_dict[yr][lvl][level_sum_key]
                prob_row = [str(key) + ' : ' + str(round(helpers.weird_division(value, level_sum), 4))
                            for key, value in global_trans_dict[yr][lvl].items()]
                prob_writer.writerow(prob_row)

            count_writer.writerow([]), prob_writer.writerow([])
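# Illustrative sketch (hypothetical numbers, not part of the pipeline) of the row format the writer loop above
# produces for one level: a count row, and a probability row that divides each count by the level sum. The dict
# shape below is an assumption inferred from how the keys are read in the function, not the actual output of
# inter_level_transition_matrices.
def _transition_row_example():
    level_counts = {"1-1": 30, "1-2": 8, "1-retired": 2, "1-level_sum": 40}
    count_row = [str(k) + ' : ' + str(v) for k, v in level_counts.items()]
    prob_row = [str(k) + ' : ' + str(round(v / level_counts["1-level_sum"], 4))
                for k, v in level_counts.items()]
    return count_row, prob_row  # e.g. ['1-1 : 30', ...] and ['1-1 : 0.75', ...]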
def freq_mat_to_prob_mat(frequency_matrix, round_to=15):
    """
    Take a matrix of frequencies and turn it into a probability matrix, where each cell is divided by its row sum.

    NB: leaves zero rows as they are

    :param frequency_matrix: list of lists, e.g. [[1, 3], [9, 1]]
    :param round_to: int, to how many decimals we want to round; default is fifteen
    :return: list of lists, where rows sum to 1, e.g. [[0.25, 0.75], [0.9, 0.1]]
    """
    probability_matrix = []
    for i in range(len(frequency_matrix)):
        row_sum = sum(frequency_matrix[i])
        prob_row = [round(helpers.weird_division(round(cell, 5), row_sum), round_to)
                    for cell in frequency_matrix[i]]
        probability_matrix.append(prob_row)
    return probability_matrix
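# A minimal sketch of the helpers.weird_division semantics assumed throughout this module: ordinary division
# that returns 0 instead of raising on a zero denominator (consistent with the zero-row note above). This is
# an inference from context, not the actual helpers implementation.
def _weird_division_sketch(numerator, denominator):
    return numerator / denominator if denominator else 0
# e.g. freq_mat_to_prob_mat([[1, 3], [0, 0]]) would then yield [[0.25, 0.75], [0, 0]],
# with the zero row passing through unchanged.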
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of retirements is that it does not distinguish between people who
    genuinely leave the profession and those who simply leave the sample (i.e. move to another area) but remain
    in the profession. Consequently, raw sample retirement counts are biased upwards, because profession-exits
    and sample-exits are implicitly equated.

    The solution is to use the total population to compute the fraction of departures from the sample areas that
    are genuine exits from the profession, and then to multiply the raw sample retirement count by that fraction,
    thereby reducing the upward bias. To be exact, the genuine retirement fraction is computed as

    genuine retirement fraction = genuine retirement counts / (genuine retirement counts + sample-leaving counts)

    and the adjusted retirement count will therefore be

    adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are levels in the judicial hierarchy, 2nd layer keys are years,
             and base values are the adjusted retirement counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction, and is within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:  # we look ahead one year to spot departures-cum-retirements
                    # if next year's area is NOT within the sampling areas, increment sample departures
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1
                # if this is the last year of the career, increment genuine retirements
                else:
                    # NB: this always assumes that the fraction years are earlier than the right censor year
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # average over the years, then get the final fraction, per level
    for lvl in ret_fracts:
        avg_gen_rets = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_samp_leave_rets = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen_rets, (avg_gen_rets + avg_samp_leave_rets),
                                                 mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round the result to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    if weights:
        return ret_fracts
    else:
        return samp_ret_counts
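# Worked example with hypothetical numbers: the adjustment above shrinks the raw retirement count by the
# genuine retirement fraction. Note that the yearly averaging cancels out of the fraction itself.
def _adjusted_retirement_example():
    gen_rets, samp_leaves, n_fract_yrs = 30, 10, 3
    avg_gen = gen_rets / n_fract_yrs             # 10.0
    avg_leave = samp_leaves / n_fract_yrs        # ~3.33
    fraction = avg_gen / (avg_gen + avg_leave)   # 0.75
    return round(8 * fraction, 4)                # raw sample count 8 -> adjusted 6.0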
def adjusted_lateral_transfer_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of lateral transfers is that it is biased downward, for two reasons:

    a) those who transfer laterally to a position outside the sample will appear to have retired, biasing the
       lateral transfer count downward
    b) those who entered the sample via lateral transfer from outside the sample will appear to be new entrants,
       likewise biasing the lateral transfer count downward

    Essentially, the sample only counts lateral transfers that occur within the sample, ignoring those lateral
    transfers that feature sample entry or departure. To fix this bias we use the total population to compute the
    genuine lateral transfer ratio, namely

    genuine lateral transfer ratio = (within-sample lateral transfers + lateral transfers leaving the sample
                                      + lateral transfers entering the sample) / within-sample lateral transfers

    and the adjusted lateral transfer count will therefore be

    adjusted number of lateral transfers = within-sample lateral transfer count * genuine lateral transfer ratio

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the ratios by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are years, 2nd layer keys are levels in the judicial hierarchy,
             and base values are the adjusted lateral transfer counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    trans_fracts = {lvl: {"within_samp_transfs": 0, "samp_leave_transfs": 0, "samp_ent_transfs": 0}
                    for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction, and is within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                if idx < len(person) - 1:  # we look ahead one year to judge mobility within or leaving the sample
                    # if the current hierarchical level equals NEXT year's level AND the exact workplaces differ
                    # (i.e. there's a lateral transfer this year)
                    if current_lvl == int(person[idx + 1][lvl_col_idx]) and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx + 1],
                                                                                          profession):
                        # if next year's area is outside the sample, increment the count of leaving-sample transfers
                        if person[idx + 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_leave_transfs"] += 1
                        # otherwise next year's area is within the sample, so increment within-sample transfers
                        else:
                            trans_fracts[current_lvl]["within_samp_transfs"] += 1

                if 1 < idx:  # we look behind one year to see if someone entered the sample from elsewhere
                    # if LAST year's hierarchical level was the same as this year's AND the exact workplaces differ
                    # (i.e. a lateral transfer occurred last year)
                    if int(person[idx - 1][lvl_col_idx]) == current_lvl and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx - 1],
                                                                                          profession):
                        # if last year's area was not within the sample, increment the count of extra-sample
                        # entries via lateral transfer
                        if person[idx - 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_ent_transfs"] += 1

    # average over the years, then get the final ratio, per level
    for lvl in trans_fracts:
        avg_within_samp_transfs = float(trans_fracts[lvl]["within_samp_transfs"]) / float(len(fracts_yrs))
        avg_samp_leave_transfs = float(trans_fracts[lvl]["samp_leave_transfs"]) / float(len(fracts_yrs))
        avg_samp_ent_transfs = float(trans_fracts[lvl]["samp_ent_transfs"]) / float(len(fracts_yrs))
        trans_fracts[lvl] = helpers.weird_division((avg_within_samp_transfs + avg_samp_leave_transfs
                                                    + avg_samp_ent_transfs),
                                                   avg_within_samp_transfs, mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_transf_counts = hierarchical.hierarchical_mobility(cas_sample_table, profession)

    # and weigh them; round the result to four decimals
    for yr in samp_transf_counts:
        for lvl in samp_transf_counts[yr]:
            samp_transf_counts[yr][lvl] = round(samp_transf_counts[yr][lvl]["across"]["total"] * trans_fracts[lvl],
                                                4)

    if weights:
        return trans_fracts
    else:
        return samp_transf_counts
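# Worked example with hypothetical numbers: the lateral transfer ratio is >= 1, so it scales the within-sample
# count upward. As with the retirement fraction, the yearly averaging cancels in the ratio.
def _adjusted_transfer_example():
    within, leaving, entering = 12, 3, 3
    ratio = (within + leaving + entering) / within   # 18 / 12 = 1.5
    return round(4 * ratio, 4)                       # raw within-sample count 4 -> adjusted 6.0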
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of entries is that it does not distinguish between people who are
    genuinely new recruits to the profession and those who were already in the profession but outside the sample.
    Consequently, the raw count is biased upwards, because it equates entering the sample from within the
    profession with entering the profession tout court.

    The solution is to use the total population to compute the fraction of entries into the sample that are
    genuine recruits into the profession, and then to multiply the raw sample entry count by that fraction,
    thereby reducing the upward bias. To be exact, the genuine entry fraction is computed as

    genuine entry fraction = genuine entry counts / (genuine entry counts + sample-entering counts)

    and the adjusted entry count will therefore be

    adjusted number of entries = raw sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we
                    weigh the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are levels in the judicial hierarchy, 2nd layer keys are years,
             and base values are the adjusted entry counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year, then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dict; NB: four possible levels, even though level 3 (appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):
            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction, and is within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                # if it's genuinely the first year of the career, increment genuine entries
                # NB: this always assumes that we skip the left censor year
                if idx == 0:
                    ent_fracts[current_lvl]["gen_ents"] += 1

                if 1 < idx:  # we look behind one year to see if someone entered the sample from elsewhere
                    # if LAST year's appellate area differs from this year's appellate area, increment the count
                    # of extra-sample entries
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1

    # average over the years, then get the final fraction, per level
    for lvl in ent_fracts:
        avg_gen_ents = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp_ents = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen_ents, (avg_gen_ents + avg_samp_ents), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round the result to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    if weights:
        return ent_fracts
    else:
        return samp_ent_counts
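# Worked example with hypothetical numbers: the entry fraction discounts sample entries that came from
# elsewhere in the profession, keeping only genuine recruits.
def _adjusted_entry_example():
    gen_ents, samp_ents = 20, 5
    fraction = gen_ents / (gen_ents + samp_ents)   # 0.8
    return round(10 * fraction, 4)                 # raw sample entry count 10 -> adjusted 8.0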
def yearly_weights(person_year_table, profession, appellate_areas_to_sample, weighting_year):
    """
    Get the following weights (as ratios), per year, per level:

    - retirements / (retirements + area leaves)
    - internal promotions / total promotions
    - external promotions / total entries

    All counts are based on comparing the sampled appellate areas to the population in the other appellate areas.

    NB: these weights pool across sampled areas.

    NB: keys in base-level dicts indicate judicial level: 1 = low court, 2 = tribunal, 3 = appeals, 4 = high court.

    NB: by convention I turn undefined weights (where the denominator is zero) to zero.

    NB: assumes the weighting years feature the entire population.

    :param person_year_table: a table of person-years, as a list of lists; comes with NO header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param appellate_areas_to_sample: list of appellate area codes indicating which areas we sample,
                                      e.g. ["CA1", "CA5"]
    :param weighting_year: year based on which we draw weights. NB: since we measure mobility by comparing this
                           year with adjacent ones (e.g. we know you got promoted because your level in
                           weighting_year is less than your level in weighting_year + 1), weighting_year actually
                           signifies an interval, so "2006" refers to mobility in the period 2006-2007. Years are
                           str, e.g. "2017".
    :return: dict of yearly weights
    """
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')

    # make the dicts that hold mobility counts per level
    lvls_dict = {"1": 0, "2": 0, "3": 0}
    total_retirements, total_area_leaves = deepcopy(lvls_dict), deepcopy(lvls_dict)
    total_promotions, internal_promotions = deepcopy(lvls_dict), deepcopy(lvls_dict)
    total_entries, external_promotions = deepcopy(lvls_dict), deepcopy(lvls_dict)

    # group the table by persons
    person_year_table = sorted(person_year_table, key=operator.itemgetter(pid_col_idx, yr_col_idx))
    people = [person for k, [*person] in groupby(person_year_table, key=operator.itemgetter(pid_col_idx))]

    # iterate through people
    for person in people:
        # iterate through person-years
        for idx, pers_yr in enumerate(person):

            if idx < 1:  # the first year of the career; NB: this always assumes that we skip the left censor year
                # if the first year is the weighting year and the person-year is in the sampling areas,
                # increment total entries
                if pers_yr[yr_col_idx] == weighting_year and pers_yr[ca_cod_idx] in appellate_areas_to_sample:
                    total_entries[pers_yr[lvl_col_idx]] += 1

            elif 0 < idx < len(person) - 1:  # look up to the second-to-last person-year
                # if this year is the weighting year, and this person-year is in the sampling areas
                if pers_yr[yr_col_idx] == weighting_year and pers_yr[ca_cod_idx] in appellate_areas_to_sample:

                    # if the current appellate area differs from next year's appellate area,
                    # increment total area leaves
                    if pers_yr[ca_cod_idx] != person[idx + 1][ca_cod_idx]:
                        total_area_leaves[pers_yr[lvl_col_idx]] += 1

                    # if the current appellate area differs from LAST year's appellate area AND last year's level
                    # is lower than this year's level, increment external promotions
                    if pers_yr[ca_cod_idx] != person[idx - 1][ca_cod_idx] \
                            and person[idx - 1][lvl_col_idx] < pers_yr[lvl_col_idx]:
                        external_promotions[pers_yr[lvl_col_idx]] += 1

                    # if this year's level is lower than next year's level, increment total promotions
                    if pers_yr[lvl_col_idx] < person[idx + 1][lvl_col_idx]:
                        total_promotions[pers_yr[lvl_col_idx]] += 1

                        # if, further, this year's appellate area is the same as next year's,
                        # increment internal promotions
                        if pers_yr[ca_cod_idx] == person[idx + 1][ca_cod_idx]:
                            internal_promotions[pers_yr[lvl_col_idx]] += 1

            else:  # we're in the last year, i.e. the retirement year
                # NB: this always assumes that the weighting year is less than the right censor year
                # if the last year is the weighting year and in the sampling areas, increment retirements
                if person[-1][yr_col_idx] == weighting_year and person[-1][ca_cod_idx] in appellate_areas_to_sample:
                    total_retirements[person[-1][lvl_col_idx]] += 1

    # make the retirement weights
    retirement_weights = {}
    for key in total_retirements:
        retirement_weights.update(
            {key: helpers.weird_division(total_retirements[key],
                                         (total_area_leaves[key] + total_retirements[key]))})

    # make the internal promotion weights
    internal_promotion_weights = {}
    for key in total_promotions:
        internal_promotion_weights.update(
            {key: helpers.weird_division(internal_promotions[key], total_promotions[key])})

    # make the external promotion weights
    external_promotion_weights = {}
    for key in total_entries:
        external_promotion_weights.update(
            {key: helpers.weird_division(external_promotions[key], total_entries[key])})

    return {"ret_leave": retirement_weights,
            "int_prom": internal_promotion_weights,
            "ext_prom": external_promotion_weights}
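# Shape-and-formula sketch with hypothetical counts for a single judicial level, mirroring the three weights
# computed above (retirement share of area departures, internal share of promotions, promotion share of entries):
def _yearly_weights_example():
    rets, area_leaves = 8, 2
    int_proms, tot_proms = 6, 10
    ext_proms, tot_ents = 3, 12
    return {"ret_leave": rets / (rets + area_leaves),   # 0.8
            "int_prom": int_proms / tot_proms,          # 0.6
            "ext_prom": ext_proms / tot_ents}           # 0.25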
def retirement_promotion_estimates(person_year_table, profession, sampling_year_range, out_dir):
    """
    Estimate how many people retire and move up the legal hierarchy (i.e. get promoted) every year, both in raw
    counts and relative to the population of people open to such mobility.

    Post-2005 we have the complete population of magistrates (i.e. judges and prosecutors), but pre-2005 we have
    only non-random samples. For judges I sample four appellate areas (Alba, Craiova, Iaşi, and Ploieşti) because
    I have yearly data on all courts in these areas since at least 1980. That said, mobility estimates from these
    samples need to be corrected. In particular, I look at three sorts of mobility: retirement, promotion, and
    entry.

    Post-2005 we are certain that someone retires when they are in the population in year X but absent in year
    X+1. Pre-2005 we can't be certain, because that person may have left the sample but stayed in the population,
    i.e. they simply changed appellate area. I therefore correct the sample estimates as follows:

    - for the intervals 2006-2007, 2007-2008, and 2008-2009, see how many magistrates in the sampled areas (Alba,
      Craiova, Iaşi, and Ploieşti) actually retired, and how many just left their respective area. Compute the
      ratio "retirement counts" / ("retirement counts" + "area-leaving counts") for each interval, and take the
      three-interval average. The result is a weight: X% of the people that departed the sampled areas actually
      retired. There is one ratio for each judicial level (i.e. low court, tribunal, and appeals).

    - for pre-2005, I count how many people left the sample, then multiply the per-level count by the appropriate
      weight. Obviously, this assumes that the ratio between retirements and area changes is constant over this
      period. I cannot numerically check that assumption.

    Regarding promotion, post-2005 we can just see if someone's judicial level increased between years. Pre-2005
    this count will be biased in the sample because a) those who receive a promotion outside the sample show up
    as retirements, and b) those who entered the sample upon promotion look like new entrants. To address this I
    construct two weights: the ratio of within-area promotions to total promotions, and the ratio of
    entrants-by-promotion to total entrants (per year, per level). The final count of (weighted) sample
    promotions is then computed as

    raw promotion count * (1 / within-to-total ratio) + entrant count * promotion-entrants-to-total ratio

    Finally, to estimate the number of entrants into the profession using the sample, I compute

    entrant count * (1 - promotion-entrants-to-total ratio)

    Again, the assumption is that the relative balance of inter-area mobility flows is constant throughout the
    period under study, and therefore that ratios derived from 2006-2009 hold at other times as well. I choose
    the 2006-2009 period because a) it features the earliest population-level data, and b) it did not feature
    major judicial reforms.

    Finally, we also want estimates of the total size of the population and of year-on-year population growth.

    :param person_year_table: a table of person-years, as a list of lists; NB: assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, the range of years for which we're estimating mobility,
                                e.g. (1998, 2004)
    :param out_dir: directory where the tables of mobility estimates will live
    :return: None
    """
    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')

    # sort the person-year table by person, then year
    person_year_table.sort(key=operator.itemgetter(pid_col_idx, yr_col_idx))

    # sample all courts in these appeals regions: Alba (CA1), Craiova (CA7), Iaşi (CA9), Ploieşti (CA12)
    appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
    cas_sample_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)

    # get the weights for retirement, promotion, and entry
    # for those appeals areas, for the intervals 2006-07, 2007-08, and 2008-09, per hierarchical level:
    # a) get the ratio of within-area promotions (i.e. people who were already in the area) to total promotions
    # b) get the ratio of retirements to retirements + out-of-area transfers
    # Average the values across the three intervals: these are the weights for estimates from earlier years
    weights = three_year_average_weights(person_year_table, profession, appellate_areas_to_sample,
                                         ["2006", "2007", "2008"])
    retirement_weights = weights["ret_weight"]
    internal_promotion_weights = weights["int_prom_weight"]
    external_promotion_weights = weights["ext_prom_weight"]

    # get the raw counts of entries, retirements, and promotions per year, per level, in the desired time-frame
    counts = get_raw_counts(cas_sample_table, profession, sampling_year_range)
    ent_counts, ret_counts, prom_counts = counts["entries"], counts["retirements"], counts["promotions"]

    # now weigh those counts with the average ratios from 2006-2009. Recall (counts are from the sample):
    # estimated retirements = retirement count * retirement weight
    # estimated promotions = promotion count * (1 / internal promotion weight)
    #                        + entry count * external promotion weight
    # estimated entries = entry count * (1 - external promotion weight)
    for key in internal_promotion_weights:
        for year in ret_counts.keys():
            # round, since these are whole people
            ret_counts[year][key] = round(float(ret_counts[year][key]) * retirement_weights[key])
            prom_counts[year][key] = round(float(helpers.weird_division(prom_counts[year][key],
                                                                        internal_promotion_weights[key])
                                                 + float(ent_counts[year][key]) * external_promotion_weights[key]))
            ent_counts[year][key] = round(ent_counts[year][key] * (1 - external_promotion_weights[key]))

    # relabel, strictly for clarity (notice it's not a deepcopy)
    weighted_ret_counts = ret_counts
    weighted_prom_counts = prom_counts
    weighted_ent_counts = ent_counts

    # using the (weighted, estimated) sample counts, estimate yearly, per-level retirement and promotion
    # probabilities, where the denominator is the sample count of person-years in year X; also estimate what
    # proportion of each year's sample are new entrants
    yearly_counts = counts["total counts"]
    retire_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}
    promotion_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}
    entry_proportions = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts.keys()}

    for year in yearly_counts:
        for lvl in yearly_counts[year]:
            promotion_probs[year][lvl] = helpers.weird_division(weighted_prom_counts[year][lvl],
                                                                yearly_counts[year][lvl])
            retire_probs[year][lvl] = helpers.weird_division(weighted_ret_counts[year][lvl],
                                                             yearly_counts[year][lvl])
            # NB: entry proportions are simple: how many of this year's sample are newcomers?
            entry_proportions[year][lvl] = helpers.weird_division(weighted_ent_counts[year][lvl],
                                                                  yearly_counts[year][lvl])

    # estimate the size of the professional population for the years for which we only have samples
    estimated_pop = estimated_population_size(person_year_table, cas_sample_table, profession, sampling_year_range)

    # estimate year-on-year population growth
    estimated_pop_growth = estimated_population_growth(estimated_pop, sampling_year_range)

    # save to disk one table each for retirements, promotions, and entries,
    # and one table for estimated population size and growth
    with open(out_dir + "retirements.csv", 'w') as out_ret:
        writer = csv.writer(out_ret)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT RETIREMENTS", "SAMPLE RETIREMENT PROBABILITY"])
        for year in weighted_ret_counts:
            for lvl in weighted_ret_counts[year]:
                writer.writerow([year, lvl, weighted_ret_counts[year][lvl], retire_probs[year][lvl]])

    with open(out_dir + "promotions.csv", 'w') as out_prom:
        writer = csv.writer(out_prom)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT PROMOTIONS", "SAMPLE PROMOTION PROBABILITY"])
        for year in weighted_prom_counts:
            for lvl in weighted_prom_counts[year]:
                if lvl in weighted_prom_counts[year] and lvl in promotion_probs[year]:
                    writer.writerow([year, lvl, weighted_prom_counts[year][lvl], promotion_probs[year][lvl]])

    with open(out_dir + "entries.csv", 'w') as out_ent:
        writer = csv.writer(out_ent)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "LEVEL", "PROJECTED COUNT ENTRIES", "SAMPLE ENTRY PROPORTIONS"])
        for year in weighted_ent_counts:
            for lvl in weighted_ent_counts[year]:
                writer.writerow([year, lvl, weighted_ent_counts[year][lvl], entry_proportions[year][lvl]])

    with open(out_dir + "growth.csv", 'w') as out_grow:  # lol
        writer = csv.writer(out_grow)
        writer.writerow([profession.upper()])
        writer.writerow(["YEAR", "PROJECTED POPULATION", "SAMPLE PERCENT GROWTH SINCE PREVIOUS YEAR"])
        for year in estimated_pop:
            # we only know population growth from the second year onward
            if year == min(sorted(list(estimated_pop.keys()))):
                writer.writerow([year, estimated_pop[year], "NA"])
            else:
                writer.writerow([year, estimated_pop[year], estimated_pop_growth[year]])
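# Worked example with hypothetical numbers, showing how the three correction formulas above combine raw sample
# counts and the 2006-2009 weights into the estimates written to disk:
def _weighted_estimates_example():
    ret_count, ret_weight = 10, 0.8
    prom_count, int_prom_weight = 6, 0.6
    ent_count, ext_prom_weight = 12, 0.25
    est_rets = round(ret_count * ret_weight)                                       # 8
    est_proms = round(prom_count / int_prom_weight + ent_count * ext_prom_weight)  # 10 + 3 = 13
    est_ents = round(ent_count * (1 - ext_prom_weight))                            # 9
    return est_rets, est_proms, est_ents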