def make_percent_pre_1990_table(person_year_table, profession, out_dir, out_dir_area_samp=None, area_sample=False):
    """
    Make a table that shows, for every given year, the percentage of people in the system who entered the system prior
    to 1990. This is meant to show the rate of decrease of socialist-era judges. Percentages are disaggregated by
    judicial level and dumped in a csv table.

    NB: when running this metric on the sample, this function assumes that entries of pre-1990 people into the sample
        and their departures from it balance out, so that the sampling itself doesn't influence the before-to-after
        1990 ratio.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: str, directory where we want the non-area-sampled results table to live
    :param out_dir_area_samp: str, if given it's where we want the sample-area results table to live
    :param area_sample: bool, True if you want to exclusively use data from the Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions; False by default
    :return: None
    """
    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]  # I hard code this in since it changes very rarely
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)
        out_dir = out_dir_area_samp

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession, 'preprocess').index('cod persoană')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort table by person and year, then group table by persons
    person_year_table = sorted(person_year_table, key=itemgetter(pid_col_idx, yr_col_idx))
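    # NB: "[*person]" eagerly unpacks each groupby grouper, so every element of "people"
    # is one person's full list of person-year rows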
    people = [person for k, [*person] in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # get the span of years
    years = sorted(list({int(py[yr_col_idx]) for py in person_year_table}))

    # initialise the nested dict holding the data, in three layers: hierarchical level, then year, then counts
    b4_1990_year_dict = {i: {yr: {"before 1990": 0, "total count": 0} for yr in years} for i in range(1, 5)}

    for person in people:
        first_career_year = int(person[0][yr_col_idx])
        for pers_yr in person:
            current_year = int(pers_yr[yr_col_idx])
            current_level = int(pers_yr[lvl_col_idx])
            b4_1990_year_dict[current_level][current_year]["total count"] += 1
            if first_career_year <= 1990:
                b4_1990_year_dict[current_level][current_year]["before 1990"] += 1

    # calculate percent from before 1990, only for 1990 and after (before 1990 it's always 100%)
    percs_lvl = {lvl: [] for lvl in b4_1990_year_dict}
    for lvl in b4_1990_year_dict:
        for yr in years:
            if yr >= 1990:
                percs_lvl[lvl].append(helpers.percent(b4_1990_year_dict[lvl][yr]["before 1990"],
                                                      b4_1990_year_dict[lvl][yr]["total count"]))

    # write each level timeseries to disk
    with open(out_dir + "percent_pre_1990.csv", "w") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["Hierarchical Level"] + [yr for yr in years if yr >= 1990])
        for lvl in b4_1990_year_dict:
            writer.writerow([lvl] + percs_lvl[lvl])
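
# A minimal sketch of the percentage logic above, assuming helpers.percent(a, b)
# returns 100 * a / b (hypothetical stand-in and toy numbers, not real data):
def _toy_percent(numerator, denominator):
    return round(100.0 * numerator / denominator, 2) if denominator else 0

_toy_counts = {"before 1990": 30, "total count": 120}  # e.g. hypothetical level-2 counts for 1995
assert _toy_percent(_toy_counts["before 1990"], _toy_counts["total count"]) == 25.0
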
def estimated_population_size(person_year_table, profession):
    """
    To estimate the total size of the profession (i.e. of the population) for years in which we only have a sample,
    estimate the ratio between population size and sample size for years in which we DO have the whole population, then,
    for years with samples only, multiply the observed sample size by this population inflation ratio. To be exact,

      population inflation ratio = population size / sample size

      estimated population size = sample size * population inflation ratio

    NB: this assumes that the ratio between the total population and the sum of the sampled areas is roughly constant
        across the years whose population sizes we're estimating.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :return: dict of estimated population sizes: keys are years, values are estimates
    """

    samp_yrs, samp_as, ratio_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]
    pop_yrs, total_yrs = pop_yr_range[profession], total_range[profession]
    areas_sample_table = sample.appellate_area_sample(person_year_table, profession, samp_as)

    # get sample and population sizes for the desired years, then average across said years
    samp_size = totals_in_out.pop_cohort_counts(areas_sample_table, total_yrs[0], total_yrs[1], profession,
                                                cohorts=False, unit_type="nivel")
    pop_size = totals_in_out.pop_cohort_counts(person_year_table, pop_yrs[0], pop_yrs[1], profession,
                                               cohorts=False, unit_type="nivel")
    avg_samp_size, avg_total_size = 0, 0
    for r_yr in ratio_yrs:
        avg_samp_size += samp_size["grand_total"][r_yr]["total_size"]
        avg_total_size += pop_size["grand_total"][r_yr]["total_size"]
    avg_samp_size = float(avg_samp_size) / float(len(ratio_yrs))
    avg_total_size = float(avg_total_size) / float(len(ratio_yrs))
    pop_inflation_ratio = avg_total_size / avg_samp_size

    # for each year in which we only have samples, multiply the number of people in sample by the population inflation
    # ratio; these are the population estimates for the years in which we only have samples. Round to 4 decimals.
    estim_pop = {}
    for yr in range(samp_yrs[0], samp_yrs[1] + 1):
        estim_pop.update({yr: round(float(samp_size["grand_total"][yr]["total_size"] * pop_inflation_ratio), 4)})
    return estim_pop
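
# Toy check of the inflation-ratio arithmetic documented above (hypothetical numbers):
# an average population of 4000 over an average sample of 1000 gives a ratio of 4.0,
# so an observed sample of 1100 people implies an estimated population of 4400.
_avg_total, _avg_samp = 4000.0, 1000.0
_pop_inflation_ratio = _avg_total / _avg_samp  # 4.0
assert round(1100 * _pop_inflation_ratio, 4) == 4400.0
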
def adjusted_retirement_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of retirements is that it does not distinguish between people who genuinely
    leave the profession and those who simply leave the sample (i.e. move to another area) but remain in the profession.
    Consequently, raw sample retirement counts are biased upwards because profession-exits and sample-exits are
    implicitly equated.

    The solution is to use the total population to compute the fraction of retirements from the sample area that are
    genuine departures from the profession and then to multiply the raw sample retirement count by that fraction,
    thereby reducing the upward bias. To be exact, the genuine retirement fraction is computed by

      genuine retirement fraction = genuine retirement counts / (genuine retirement counts + sample-leaving counts)

    and the adjusted retirement count will therefore be

      adjusted number of retirements = raw sample retirement count * genuine retirement fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are level in the judicial hierarchy, 2nd layer keys are year, and base
             values are the adjusted retirement counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dicts; NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ret_fracts = {lvl: {"gen_rets": 0, "samp_leaves": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]
            # if this year is used for the fraction, and within the sampling areas
            if int(current_yr) in fracts_yrs and current_area in samp_as:
                if idx < len(person) - 1:  # since we do look-aheads to see departures-cum-retirements
                    # if next year's area is NOT within the sampling area, increment sample departures
                    if person[idx + 1][ca_cod_idx] not in samp_as:
                        ret_fracts[current_lvl]["samp_leaves"] += 1

                # otherwise this is the person's last observed year: increment genuine retirements
                else:  # NB: this always assumes we pick a sampling year that is less than the right censoring year
                    ret_fracts[current_lvl]["gen_rets"] += 1

    # average over the years then get the final fraction, per level
    for lvl in ret_fracts:
        avg_gen_rets = float(ret_fracts[lvl]["gen_rets"]) / float(len(fracts_yrs))
        avg_samp_leave_rets = float(ret_fracts[lvl]["samp_leaves"]) / float(len(fracts_yrs))
        ret_fracts[lvl] = helpers.weird_division(avg_gen_rets, (avg_gen_rets + avg_samp_leave_rets), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ret_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=False)
    samp_ret_counts.pop("grand_total")  # don't need the grand total

    # and weigh them; round result to four decimals
    for lvl in samp_ret_counts:
        for yr in samp_ret_counts[lvl]:
            samp_ret_counts[lvl][yr] = round(samp_ret_counts[lvl][yr]["total_size"] * ret_fracts[int(lvl)], 4)

    if weights:
        return ret_fracts
    else:
        return samp_ret_counts
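
# Illustration of the genuine-retirement weighting above (hypothetical counts): 80
# genuine retirements and 20 mere sample departures give a fraction of
# 80 / (80 + 20) = 0.8, so 50 raw sample retirements adjust down to 40.
_gen_rets, _samp_leaves = 80.0, 20.0
_genuine_retirement_fraction = _gen_rets / (_gen_rets + _samp_leaves)  # 0.8
assert round(50 * _genuine_retirement_fraction, 4) == 40.0
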
def adjusted_lateral_transfer_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of lateral transfers is that it is biased downward, for two reasons:

     a) those who transfer laterally to a position outside the sample will appear to have retired, thus biasing the
        lateral transfer count downward

     b) those who entered the sample via lateral transfer from outside the sample will appear to be new entrants, thus
        biasing the lateral transfer count downward

    Essentially, the sample only counts lateral transfers that occur within the sample, ignoring those lateral transfers
     that feature sample entry or departure.

    To fix this bias we use the total population to compute the genuine lateral transfer ratio, namely

      genuine lateral transfer ratio = (within-sample lateral transfers +
                                        lateral transfers leaving the sample +
                                        lateral transfers entering the sample)
                                                        /
                                           within-sample lateral transfers

    and the adjusted lateral transfer count will therefore be

      adjusted number of lateral transfers = within-sample lateral transfer count * genuine lateral transfer ratio

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are year, 2nd layer keys are level in the judicial hierarchy, and base
             values are the adjusted lateral transfer counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dicts; NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    trans_fracts = {lvl: {"within_samp_transfs": 0, "samp_leave_transfs": 0, "samp_ent_transfs": 0}
                    for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                if idx < len(person) - 1:  # since we do look-aheads to judge mobility within or leaving the sample

                    # if current hierarchical level is equal to NEXT year's AND the exact workplaces differ
                    # (i.e. there's a lateral transfer this year):
                    if current_lvl == int(person[idx + 1][lvl_col_idx]) and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx + 1], profession):

                        # if next year's area is outside the sample, increment count of leaving-sample transfers
                        if person[idx + 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_leave_transfs"] += 1
                        # otherwise the transfer stayed within the sample: increment within-sample transfers
                        else:
                            trans_fracts[current_lvl]["within_samp_transfs"] += 1

                if 1 < idx:  # we do look-behinds to see if someone entered the sample from elsewhere

                    # if LAST year's hierarchical level was the same as this year's AND the exact workplaces differ
                    # (i.e. a lateral transfer occurred since last year)
                    if int(person[idx - 1][lvl_col_idx]) == current_lvl and \
                            get_workplace_code(pers_yr, profession) != get_workplace_code(person[idx - 1], profession):
                        # if last year's area was not within the sample, increment the count of extra-sample entries
                        # via lateral transfer
                        if person[idx - 1][ca_cod_idx] not in samp_as:
                            trans_fracts[current_lvl]["samp_ent_transfs"] += 1

    # average over the years then get the final fraction, per level
    for lvl in trans_fracts:
        avg_within_samp_transfs = float(trans_fracts[lvl]["within_samp_transfs"]) / float(len(fracts_yrs))
        avg_samp_leave_transfs = float(trans_fracts[lvl]["samp_leave_transfs"]) / float(len(fracts_yrs))
        avg_samp_ent_transfs = float(trans_fracts[lvl]["samp_ent_transfs"]) / float(len(fracts_yrs))
        trans_fracts[lvl] = helpers.weird_division((avg_within_samp_transfs +
                                                    avg_samp_leave_transfs +
                                                    avg_samp_ent_transfs),
                                                   avg_within_samp_transfs, mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_transf_counts = hierarchical.hierarchical_mobility(cas_sample_table, profession)

    # and weigh them; round result to four decimals
    for yr in samp_transf_counts:
        for lvl in samp_transf_counts[yr]:
            samp_transf_counts[yr][lvl] = round(samp_transf_counts[yr][lvl]["across"]["total"] * trans_fracts[lvl], 4)

    if weights:
        return trans_fracts
    else:
        return samp_transf_counts
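
# Illustration of the lateral-transfer ratio above (hypothetical counts): 60
# within-sample transfers plus 25 leaving and 15 entering the sample give a ratio of
# (60 + 25 + 15) / 60, inflating the 60 observed transfers back up to 100.
_within, _leaving, _entering = 60.0, 25.0, 15.0
_transfer_ratio = (_within + _leaving + _entering) / _within
assert round(_within * _transfer_ratio, 4) == 100.0
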
def adjusted_entry_counts(person_year_table, profession, weights=False):
    """
    The problem with the raw sample count of entries is that it does not distinguish between people who are genuinely
    new recruits to the profession, and those who were already in the profession but outside the sample. Consequently,
    the raw count is biased upwards because it equates entering the sample from within the profession with entering
    the profession tout-court.

    The solution is to use the total population to compute the fraction of entries into the sample that are genuine
    recruits into the profession and then to multiply the raw sample entry count by that fraction, thereby reducing the
    upward bias. To be exact, the genuine entry fraction is computed by

      genuine entry fraction = genuine entry counts / (genuine entry counts + sample-entering counts)

    and the adjusted entry count will therefore be

      adjusted number entries = sample entry count * genuine entry fraction

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param weights: bool, if True then instead of returning the adjusted counts, return the fractions by which we weigh
                    the observed counts in order to reduce bias
    :return: a nested dict, where 1st layer keys are level in the judicial hierarchy, 2nd layer keys are year, and base
             values are the adjusted entry counts
    """
    samp_yrs, samp_as, fracts_yrs = samp_yr_range[profession], samp_areas[profession], pop_yrs_for_fracts[profession]

    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    ca_cod_idx = helpers.get_header(profession, 'preprocess').index('ca cod')
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')

    # sort the population table by person and year then sample from it by area
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    # initialise the dicts; NB: four possible levels, even though level 3 (Appeals courts) only began in 1993
    ent_fracts = {lvl: {"gen_ents": 0, "samp_ents": 0} for lvl in range(1, 5)}

    people = helpers.group_table_by_persons(sorted_person_year_table, profession)
    for person in people:
        for idx, pers_yr in enumerate(person):

            current_yr, current_lvl, current_area = pers_yr[yr_col_idx], int(pers_yr[lvl_col_idx]), pers_yr[ca_cod_idx]

            # if this year is used for the fraction and this year is within the sample area
            if int(current_yr) in fracts_yrs and current_area in samp_as:

                # if it's genuinely the first year, increment genuine entries
                #  NB: this always assumes that we skip the left censor year
                if idx == 0:  # the first year of the career
                    ent_fracts[current_lvl]["gen_ents"] += 1

                if 1 < idx:  # since we do look-behinds to see if someone entered the sample from elsewhere

                    # if LAST year's appellate area is different from this year's appellate area, increment count of
                    # extra-sample entries
                    if current_area != person[idx - 1][ca_cod_idx]:
                        ent_fracts[current_lvl]["samp_ents"] += 1
    # average over the years then get the final fraction, per level
    for lvl in ent_fracts:
        avg_gen_ents = float(ent_fracts[lvl]["gen_ents"]) / float(len(fracts_yrs))
        avg_samp_ents = float(ent_fracts[lvl]["samp_ents"]) / float(len(fracts_yrs))
        ent_fracts[lvl] = helpers.weird_division(avg_gen_ents, (avg_gen_ents + avg_samp_ents), mult_const=True)

    # get the raw counts
    cas_sample_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
    samp_ent_counts = totals_in_out.pop_cohort_counts(cas_sample_table, samp_yrs[0], samp_yrs[1], profession,
                                                      cohorts=True, unit_type="nivel", entry=True)
    samp_ent_counts.pop("grand_total")  # don't need the grand total
    # and weigh them; round result to four decimals
    for lvl in samp_ent_counts:
        for yr in samp_ent_counts[lvl]:
            samp_ent_counts[lvl][yr] = round(samp_ent_counts[lvl][yr]["total_size"] * ent_fracts[int(lvl)], 4)

    if weights:
        return ent_fracts
    else:
        return samp_ent_counts
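
# Illustration of the genuine-entry weighting above (hypothetical counts): 90 genuine
# recruits and 30 entries from elsewhere in the profession give a fraction of
# 90 / (90 + 30) = 0.75, so 120 raw sample entries adjust down to 90.
_gen_ents, _samp_ents = 90.0, 30.0
_genuine_entry_fraction = _gen_ents / (_gen_ents + _samp_ents)  # 0.75
assert round(120 * _genuine_entry_fraction, 4) == 90.0
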
def avg_career_length(person_year_table, profession, area_sample=True):
    """
    Print the yearly average career length per judicial level, so we can answer questions like "did tribunal (i.e.
    level 2) judges become more experienced, on average, between 1995 and 2005?"

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param area_sample: bool, True if you want to exclusively use data from Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions
    :return: None
    """

    # TODO if using the sample (and not whole population), need to top up estimates to account for the fact that
    #  some people enter from outside the sample, so it might look like it's their first year, but really they've had
    #  a longer career already

    if area_sample:
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
        person_year_table = sample.appellate_area_sample(
            person_year_table, profession, appellate_areas_to_sample)

    # add a career length count for each person, for each year

    # group table by persons
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')
    person_year_table = sorted(person_year_table,
                               key=operator.itemgetter(pid_col_idx,
                                                       yr_col_idx))
    people = [
        person
        for k, [*person] in groupby(person_year_table,
                                    key=operator.itemgetter(pid_col_idx))
    ]

    # make an augmented table where the last year is the career length of that person, in that year
    # NB: by convention we 1-index, i.e. your career length is "1" in the first year for which we observe you
    # the career length column is the last one in the table
    augmented_table = []
    for person in people:
        for idx, pers_yr in enumerate(person):
            augmented_table.append(pers_yr + [idx + 1])  # idx + 1 implements the 1-indexing convention

    # for each year, get average career length per level
    years = sorted(list({int(py[yr_col_idx]) for py in augmented_table}))
    year_dict = {year: {"1": [], "2": [], "3": [], "4": []} for year in years}

    # sort and group augmented table by year
    lvl_col_idx = helpers.get_header(profession, 'preprocess').index('nivel')
    augmented_table.sort(key=operator.itemgetter(yr_col_idx))
    year_groups = [
        grp for k, [*grp] in groupby(augmented_table,
                                     key=operator.itemgetter(yr_col_idx))
    ]
    for yr_group in year_groups:
        # recall that a year-group is made of person-years, all sharing the same year, e.g. 1996
        current_year = int(yr_group[0][yr_col_idx])
        # build the per-level person-year lists for each year, in the year_dict
        for pers_yr in yr_group:
            py_lvl = pers_yr[lvl_col_idx]
            year_dict[current_year][py_lvl].append(int(pers_yr[-1]))

    # get the level average for each year
    for yr in year_dict:
        for lvl in year_dict[yr]:
            if year_dict[yr][lvl]:  # need to be careful, no lvl 3 before 1993
                year_dict[yr][lvl] = round(statistics.mean(year_dict[yr][lvl]),
                                           2)

    # print the results
    for yr in year_dict:
        print(yr, ' | ', year_dict[yr])
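
# Minimal sketch of the career-length augmentation above, on a toy table whose columns
# are (person id, year); the appended value is the 1-indexed career length.
import itertools
import operator

_toy = [["p1", 1996], ["p1", 1995], ["p2", 1996]]
_toy.sort(key=operator.itemgetter(0, 1))
_people = [list(g) for _, g in itertools.groupby(_toy, key=operator.itemgetter(0))]
_augmented = [row + [i + 1] for p in _people for i, row in enumerate(p)]
assert _augmented[1] == ["p1", 1996, 2]  # p1's second observed year -> career length 2
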
def retirement_promotion_estimates(person_year_table, profession,
                                   sampling_year_range, out_dir):
    """
    Estimate how many people retire and move up the legal hierarchy (i.e. get promoted) every year, both in raw counts
    and relative to the population of people open to such retirement.

    Post-2005 we have the complete population of magistrates (i.e. judges and prosecutors) but pre-2005 we have only
    non-random samples. For judges I sample four appellate areas (Alba, Craiova, Iaşi, and Ploieşti) because I have
    yearly data on all courts in these areas since at least 1980. That said, mobility estimates from these samples
    need to be corrected. In particular, I look at three sorts of mobility: retirement, promotion, and entry.

    Post-2005 we are certain that someone retires when they are in the population in year X, but absent in year X+1.
    For the pre-2005 we can't be certain, because that person may have left the sample but stayed in the population,
    i.e. they have simply changed appellate area. I therefore correct sample estimates as follows:

    - for the intervals 2006-2007 and 2007-2008, see how many magistrates in the sampled areas (Alba, Craiova, Iaşi,
      and Ploieşti) actually retired, and how many just left their respective area. Compute the ratio
      "retirement counts" / "retirement counts + area-leaving counts" for each interval, and take the two-interval
      average. The result is a weight: X% of the people that departed the sampled areas actually retired. There is one
      ratio for each judicial level (i.e. low court, tribunal, and appeals).

    - for pre-2005 I count how many people left the sample, then multiply the per-level count by the appropriate weight.
      Obviously, this assumes that the ratio between retirements and area changes is constant over this period. I cannot
      numerically check that assumption.

    Regarding promotion, post-2005 we can just see if someone's judicial level increased between years. Pre-2005 this
    count will be biased in the sample because a) those who receive a promotion outside the sample show up as
    retirements, and b) those who entered the sample upon promotion look like new entrants. To address this I construct
    two weights: the ratio of within-area promotions to total promotions, and the ratio of entrants-by-promotion to
    total entrants (per year, per level).

    The final count of (weighted) sample promotions is then computed as

      estimated promotions = raw promotion count * (1 / within-total ratio)
                             + entrant count * promotion-entrants-to-total ratio

    Finally, to estimate the number of entrants into the profession using the sample, I compute

      estimated entries = entrant count * (1 - promotion-entrants-to-total ratio)

    Again, the assumption is that the relative balance of inter-area mobility flows is constant throughout the period
    under study, and therefore that ratios derived from 2006-2008 are true of other times as well. I choose the
    2006-2008 period because a) it's the earliest period with population-level data, and b) it did not feature major
    judicial reforms.

    Finally, we also want estimates of the total size of the population and of year-on-year population growth.

    :param person_year_table: a table of person-years, as a list of lists; NB: assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param sampling_year_range: 2-tuple of ints, the range of years for which we're estimating mobility, e.g. (1998, 2004)
    :param out_dir: directory where tables of mobility estimates will live
    :return: None
    """

    # get handy column indexes
    yr_col_idx = helpers.get_header(profession, 'preprocess').index('an')
    pid_col_idx = helpers.get_header(profession,
                                     'preprocess').index('cod persoană')

    # sort person-year table by person then year
    person_year_table.sort(key=operator.itemgetter(pid_col_idx, yr_col_idx))

    # sample all courts in these appeals regions: Alba (CA1), Craiova (CA7), Iaşi (CA9), Ploieşti (CA12)
    appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]
    cas_sample_table = sample.appellate_area_sample(person_year_table,
                                                    profession,
                                                    appellate_areas_to_sample)

    # get weights for retirement, promotion, and entry

    # for those appeals areas, for periods 2006-2007 and 2007-2008, per hierarchical level:
    # a) get ratio of within-area promotions (i.e. people who were already in the area) to total promotions
    # b) get ratio of retirements to retirements + out-of-area transfers
    # Average the values for 2006-07 and 2007-08: these will be weights for estimates from earlier years
    weights = three_year_average_weights(person_year_table, profession,
                                         appellate_areas_to_sample,
                                         ["2006", "2007", "2008"])
    retirement_weights = weights["ret_weight"]
    internal_promotion_weights = weights["int_prom_weight"]
    external_promotion_weights = weights["ext_prom_weight"]

    # get raw counts of entries, retirements and promotions per year, per level, in the desired time-frame
    counts = get_raw_counts(cas_sample_table, profession, sampling_year_range)
    ent_counts = counts["entries"]
    ret_counts = counts["retirements"]
    prom_counts = counts["promotions"]
    # now weigh those counts with average ratios from 2006-2008. Recall (counts are from sample):
    # estimated retirements = retirement count * retirement weight
    # estimated promotions = promotion count * (1 / internal promotion weight) + entry count * external promotion weight
    # estimated entries = entry count * (1 - external promotion weight)
    for key in internal_promotion_weights:
        for year in ret_counts.keys():
            # round up since these are whole people
            ret_counts[year][key] = round(
                float(ret_counts[year][key]) * retirement_weights[key])
            prom_counts[year][key] = round(
                float(
                    helpers.weird_division(prom_counts[year][key],
                                           internal_promotion_weights[key]) +
                    float(ent_counts[year][key]) *
                    external_promotion_weights[key]))
            ent_counts[year][key] = round(
                ent_counts[year][key] * (1 - external_promotion_weights[key]))

    # relabel, strictly for clarity (notice it's not a deepcopy)
    weighted_ret_counts = ret_counts
    weighted_prom_counts = prom_counts
    weighted_ent_counts = ent_counts

    # using the weighted sample counts, estimate yearly, per-level retirement and promotion probabilities, where the
    # denominator is the sample count of person-years in year X; also estimate what proportion of each year's sample
    # are new entrants
    yearly_counts = counts["total counts"]

    retire_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}
    promotion_probs = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}
    entry_proportions = {year: {"1": 0, "2": 0, "3": 0} for year in yearly_counts}

    for year in yearly_counts:
        for lvl in yearly_counts[year]:
            promotion_probs[year][lvl] = helpers.weird_division(
                weighted_prom_counts[year][lvl], (yearly_counts[year][lvl]))
            retire_probs[year][lvl] = helpers.weird_division(
                weighted_ret_counts[year][lvl], yearly_counts[year][lvl])
            # NB: entry proportions is simple: how many of this year's samples are newcomers?
            entry_proportions[year][lvl] = helpers.weird_division(
                weighted_ent_counts[year][lvl], yearly_counts[year][lvl])

    # estimate the size of the professional population for years for which we only have samples
    estimated_pop = estimated_population_size(person_year_table,
                                              cas_sample_table, profession,
                                              sampling_year_range)

    # estimate year-on-year population growth
    estimated_pop_growth = estimated_population_growth(estimated_pop,
                                                       sampling_year_range)

    # save to disk one table each for retirements, entries, and departures,
    # and one table for estimated population size and growth
    with open(out_dir + "retirements.csv", 'w') as out_ret:
        writer = csv.writer(out_ret)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT RETIREMENTS",
            "SAMPLE RETIREMENT PROBABILITY"
        ])
        for year in weighted_ret_counts:
            for lvl in weighted_ret_counts[year]:
                writer.writerow([
                    year, lvl, weighted_ret_counts[year][lvl],
                    retire_probs[year][lvl]
                ])

    with open(out_dir + "promotions.csv", 'w') as out_prom:
        writer = csv.writer(out_prom)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT PROMOTIONS",
            "SAMPLE PROMOTION PROBABILITY"
        ])
        for year in weighted_prom_counts:
            for lvl in weighted_prom_counts[year]:
                if lvl in promotion_probs[year]:  # lvl is already drawn from weighted_prom_counts[year]
                    writer.writerow([
                        year, lvl, weighted_prom_counts[year][lvl],
                        promotion_probs[year][lvl]
                    ])

    with open(out_dir + "entries.csv", 'w') as out_ent:
        writer = csv.writer(out_ent)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "LEVEL", "PROJECTED COUNT ENTRIES",
            "SAMPLE ENTRY PROPORTIONS"
        ])
        for year in weighted_ent_counts:
            for lvl in weighted_ent_counts[year]:
                writer.writerow([
                    year, lvl, weighted_ent_counts[year][lvl],
                    entry_proportions[year][lvl]
                ])

    with open(out_dir + "growth.csv", 'w') as out_grow:  # lol
        writer = csv.writer(out_grow)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "PROJECTED POPULATION",
            "SAMPLE PERCENT GROWTH SINCE PREVIOUS YEAR"
        ])
        for year in estimated_pop:
            if year == min(estimated_pop):  # population growth is only known from the second year on
                writer.writerow([year, estimated_pop[year], "NA"])
            else:
                writer.writerow([year, estimated_pop[year], estimated_pop_growth[year]])
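
# Numeric sketch of the promotion and entry weighting described in the docstring above
# (hypothetical counts and weights): raw promotions are inflated by the inverse of the
# within-area share, then promotions arriving from outside the sample are added back.
_raw_proms, _entrants = 12.0, 40.0
_within_total_ratio, _prom_entrants_ratio = 0.8, 0.1
_est_proms = round(_raw_proms / _within_total_ratio + _entrants * _prom_entrants_ratio)
_est_entries = round(_entrants * (1 - _prom_entrants_ratio))
assert (_est_proms, _est_entries) == (19, 36)
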
def make_vacancy_transition_tables(person_year_table, profession, out_dir, years, averaging_years=None, area_samp=False,
                                   out_dir_area_samp=None):
    """
    Make a csv containing one sub-table for each of the years that we select, with each sub-table showing the transition
    probabilities between hierarchical levels of vacancies. Optionally, we may also include a table that averages across
    the desired years, e.g. 1984-1989.

    Each sub-table should be NxN+1, where N = number of levels, and the last column represents vacancies leaving the
    system, i.e. people being recruited into the system.

    NB: diagonals signify mobility WITHIN the level

    :param person_year_table: list of lists, a list of person-years (each one a list of values)
    :param profession: string, "judges", "prosecutors", "notaries" or "executori"
    :param out_dir: str, the path to where the transition matrices will live
    :param years: list of ints, the years for which we want vacancy probability transition matrices
    :param averaging_years: list of ints over which we want to average vacancy frequency tables, e.g. [1985, 1986, 1987]
    :param area_samp: bool, True if we want to sample from specific areas
    :param out_dir_area_samp: if given, str showing the out-directory where we want the vacancy transition tables for
                              the sample areas to live
    :return: None
    """
    averaging_years = averaging_years if averaging_years else []  # if no averaging years provided, make empty list
    sorted_person_year_table = helpers.sort_pers_yr_table_by_pers_then_yr(person_year_table, profession)

    proms_weights, demos_weights, transfs_weights = None, None, None  # None so that misuse raises an error early

    # get entry counts, in easy format
    entry_counts = totals_in_out.pop_cohort_counts(sorted_person_year_table, years[0], years[-1], profession,
                                                   cohorts=True, unit_type="nivel", entry=True)
    entry_counts.pop("grand_total")  # don't need the grand total
    for lvl in entry_counts:
        for yr in entry_counts[lvl]:
            entry_counts[lvl][yr] = entry_counts[lvl][yr]["total_size"]

    if area_samp:
        # I hard code these in since they change so rarely
        samp_areas = {"judges": ["CA1", "CA7", "CA9", "CA12", "-88"], "prosecutors": []}
        samp_yr_range = {"judges": [1980, 2003], "prosecutors": []}
        samp_yrs, samp_as = samp_yr_range[profession], samp_areas[profession]

        # get sample-adjusted entry counts and sample weights for mobility
        entry_counts = area_samples.adjusted_entry_counts(person_year_table, profession)
        proms_weights = area_samples.adjusted_promotion_counts(sorted_person_year_table, profession, weights=True)
        demos_weights = area_samples.adjusted_demotion_counts(sorted_person_year_table, profession, weights=True)
        transfs_weights = area_samples.adjusted_lateral_transfer_counts(sorted_person_year_table, profession,
                                                                        weights=True)
        # restrict person-year table to sampling areas
        sorted_person_year_table = sample.appellate_area_sample(sorted_person_year_table, profession, samp_as)
        # redirect the out-directory
        out_dir = out_dir_area_samp

    # get person-level transition frequencies between levels
    trans_freqs = inter_level_transition_matrices(sorted_person_year_table, profession)

    with open(out_dir + "vacancy_probability_transition_matrixes.csv", "w") as out_f:
        writer = csv.writer(out_f)

        # this accumulator is unused if averaging_years stays empty; start from zeros so np.add accumulates correctly
        avg_vac_trans_mat = np.zeros((4, 5), float)

        # for each sampling year
        for yr in years:

            # make array of zeros, for four levels; not all years have four levels, but zero rows/columns are harmless
            trans_mat = np.zeros((4, 4))

            for lvl in range(1, 5):  # for departure levels in the system, i.e. the level FROM which mobility happens
                if lvl in trans_freqs[yr]:  # if the levels exist in that year (since some are added later)

                    # now weigh the observed values
                    # NB: route = mobility route, e.g. "1-2" means "mobility from level 1 to level 2"
                    for route, mob_freq in trans_freqs[yr][lvl].items():

                        # ignore retirements, non-movements, sums, and discontinuities
                        if route.split("-")[1].isdigit():

                            # level you leave and level you go to; -1 since numpy zero indexes
                            departing, arriving = int(route.split("-")[0]) - 1, int(route.split("-")[1]) - 1

                            # get frequency counts and put them in the frequency matrix; if sampling, weigh the counts
                            if departing < arriving:  # promotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * proms_weights[lvl], 5)

                            if departing == arriving:  # lateral transfers
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * transfs_weights[lvl], 5)

                            if departing > arriving:  # demotions
                                trans_mat[departing][arriving] = mob_freq
                                if area_samp:
                                    trans_mat[departing][arriving] = round(mob_freq * demos_weights[lvl], 5)

            # transpose the person-level mobility frequency matrix to get the vacancy mobility matrix
            vac_trans_mat = np.transpose(trans_mat)

            # by convention, we thus far treated levels in incrementing order, i.e. level 1 < 2 < 3 < 4. The convention
            # in vacancy chains studies is that 1 > 2 > 3 > 4, and to get that we transpose the array along the
            # anti-diagonal/off-diagonal
            vac_trans_mat = vac_trans_mat[::-1, ::-1].T

            # in the last column we put vacancy "retirements", i.e. entries of people into the system

            entry_freqs = [entry_counts[str(level)][yr] for level in range(1, 5) if str(level) in entry_counts]
            entries_col = np.asarray(entry_freqs[::-1])[..., None]  # give it Nx1 shape; reverse order for 1 > 2 > 3...
            vac_trans_mat = np.append(vac_trans_mat, entries_col, 1)

            if yr in averaging_years:
                avg_vac_trans_mat = np.add(avg_vac_trans_mat, vac_trans_mat)

            vac_prob_mat = freq_mat_to_prob_mat(vac_trans_mat.tolist(), round_to=5)
            # add that transition probability matrix to table
            writer.writerow([profession.upper(), yr])
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(vac_prob_mat)):
                writer.writerow([header[1:][i]] + vac_prob_mat[i])
            writer.writerow(["\n"])

        if averaging_years:
            avg_vac_trans_mat = np.divide(avg_vac_trans_mat, float(len(averaging_years)))
            avg_vac_prob_mat = freq_mat_to_prob_mat(avg_vac_trans_mat.tolist(), round_to=5)
            writer.writerow(["AVERAGED ACROSS YEARS"] + averaging_years)
            header = ["", "Level 1", "Level 2", "Level 3", "Level 4", "Recruits"]
            writer.writerow(header)
            for i in range(len(avg_vac_prob_mat)):
                writer.writerow([header[1:][i]] + avg_vac_prob_mat[i])
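
# Minimal sketch of the two matrix conventions used above, with a hypothetical
# stand-in for freq_mat_to_prob_mat that row-normalises a frequency matrix:
import numpy as np

_mat = np.arange(16).reshape(4, 4)
_flipped = _mat[::-1, ::-1].T  # transpose along the anti-diagonal
assert _flipped[0][0] == _mat[3][3]  # level order flips from 1 < 2 < 3 < 4 to 1 > 2 > 3 > 4

def _toy_freq_to_prob(freq_mat, round_to=5):
    out = []
    for row in freq_mat:
        row_sum = float(sum(row))
        out.append([round(v / row_sum, round_to) if row_sum else 0.0 for v in row])
    return out

assert _toy_freq_to_prob([[2, 2], [0, 0]]) == [[0.5, 0.5], [0.0, 0.0]]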