def make_percent_pre_1990_table(person_year_table, profession, out_dir, out_dir_area_samp=None, area_sample=False):
    """
    Write a csv table showing, for every year from 1990 onwards, the percentage of people in the system whose
    career started in or before 1990. This is meant to show the rate of decrease of socialist-era judges.
    Percentages are disaggregated by hierarchical level (1 to 4): one row per level, one column per year.

    NB: when running this metric on the sample, this function assumes that entries and departures of pre-1990 people
        into the sample balance out, so that the sampling itself doesn't influence the before-to-after 1990 ratio.

    :param person_year_table: a table of person-years, as a list of lists; assumes no header
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param out_dir: str, directory where we want the non-area-sampled results table to live; the file name is
                    appended directly to it, so it must end with a path separator
    :param out_dir_area_samp: str, where we want the sample-area results table to live; required when
                              area_sample is True
    :param area_sample: bool, True if you want to exclusively use data from Alba, Iași, Craiova, and Ploiești
                        appeals areas/regions; False by default
    :raises ValueError: if area_sample is True but out_dir_area_samp was not given
    :return: None
    """
    if area_sample:
        # fail early and clearly, rather than with a TypeError on path concatenation after all the work is done
        if out_dir_area_samp is None:
            raise ValueError("out_dir_area_samp must be given when area_sample is True")
        appellate_areas_to_sample = ["CA1", "CA7", "CA9", "CA12"]  # I hard code this in since it changes very rarely
        person_year_table = sample.appellate_area_sample(person_year_table, profession, appellate_areas_to_sample)
        out_dir = out_dir_area_samp

    # get handy column indexes; the header is looked up once and reused
    header = helpers.get_header(profession, 'preprocess')
    yr_col_idx = header.index('an')
    pid_col_idx = header.index('cod persoană')
    lvl_col_idx = header.index('nivel')

    # sort a copy of the table by person and year, then group the rows by person
    person_year_table = sorted(person_year_table, key=itemgetter(pid_col_idx, yr_col_idx))
    people = [list(rows) for _, rows in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # all observed years, and the subset that is actually reported
    years = sorted({int(py[yr_col_idx]) for py in person_year_table})
    reported_years = [yr for yr in years if yr >= 1990]

    # nested dict holding the counts: hierarchical level -> year -> counts
    b4_1990_year_dict = {lvl: {yr: {"before 1990": 0, "total count": 0} for yr in years} for lvl in range(1, 5)}

    for person in people:
        # rows are sorted by year, so the first row is the person's first observed year
        first_career_year = int(person[0][yr_col_idx])
        # NOTE(review): the cutoff is inclusive (entrants of 1990 itself count as "before 1990"), while the
        # stated purpose is "prior to 1990" -- confirm this is intended
        is_pre_1990 = first_career_year <= 1990
        for pers_yr in person:
            counts = b4_1990_year_dict[int(pers_yr[lvl_col_idx])][int(pers_yr[yr_col_idx])]
            counts["total count"] += 1
            if is_pre_1990:
                counts["before 1990"] += 1

    # percent from before 1990, only for 1990 and after (earlier years are 100% by construction)
    percs_lvl = {
        lvl: [helpers.percent(by_year[yr]["before 1990"], by_year[yr]["total count"]) for yr in reported_years]
        for lvl, by_year in b4_1990_year_dict.items()
    }

    # write each level's time series to disk; newline="" lets the csv module control line endings itself
    with open(out_dir + "percent_pre_1990.csv", "w", newline="") as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["Hierarchical Level"] + reported_years)
        for lvl, percentages in percs_lvl.items():
            writer.writerow([lvl] + percentages)
def percent_female(count_dict, units, unit_type=None):
    """
    Refresh, in place, the 'percent_female' entry of every non-empty year cell in count_dict.

    The 'grand_total' cells are always refreshed; the per-unit cells only when unit_type is given.

    :param count_dict: a dictionary of counts -- for format, see function metrics_dict
    :param units: a set of unit categories, each a string; only read when unit_type is truthy
    :param unit_type: None, or string; if string, type of the unit as it appears in header of person_year_table
                      (e.g. "camera")
    :return: None
    """

    def refresh(cell):
        # cells with nobody in them keep whatever 'percent_female' value they already hold
        if cell['total_size'] == 0:
            return
        cell['percent_female'] = helpers.percent(cell['f'], cell['total_size'])

    overall = count_dict['grand_total']
    for year in overall:
        refresh(overall[year])

        # per-unit breakdown is optional
        if not unit_type:
            continue
        for unit in units:
            refresh(count_dict[unit][year])
# Example #3
# 0
def estimated_population_growth(estimated_pop, sampling_year_range):
    """
    Compute year-on-year population growth, in percent, for every year after the first one in the range.

    :param estimated_pop: dict: keys are years as strings, values are estimated population sizes
    :param sampling_year_range: 2-tuple of ints, first and last year for which we're estimating mobility,
                                e.g. (1998, 2004); both ends are inclusive
    :return: dict of estimated growth percentages: keys are years as strings (the first year of the range has
             no entry, since it has no predecessor), values are growth relative to the preceding year
    """
    first_year = sampling_year_range[0]
    last_year = sampling_year_range[1]

    growth_by_year = {}
    for current in range(first_year + 1, last_year + 1):
        this_key, prev_key = str(current), str(current - 1)
        change = estimated_pop[this_key] - estimated_pop[prev_key]
        # growth is expressed relative to the size of the preceding year's population
        growth_by_year[this_key] = helpers.percent(change, estimated_pop[prev_key])
    return growth_by_year
def update_cohort_of_population(cohorts_dict, population_dict, entry=True, units=None):
    """
    Updates, in place, the value that shows how big a yearly cohort is relative to all the people in that year
    (stored under the key 'chrt_prcnt_of_pop' of each non-empty cohort cell).

    NB: for entry cohorts, we compare cohort sizes to all people in the PREVIOUS year. For exit cohorts, we
        compare cohort sizes to all people in the CURRENT year.

    NB: the "previous year" test is made against the per-year population dict that is actually indexed. Testing
        `year - 1 in cohorts_dict` would look among the top-level keys ('grand_total' and the unit names), never
        match a year, and so silently compare every entry cohort with the current year.

    :param cohorts_dict: a dictionary of cohorts; cohorts_dict['grand_total'] (and cohorts_dict[unit]) map each
                         year (int) to the metrics for that cohort
    :param population_dict: a dictionary for the whole population, laid out like cohorts_dict, where values are
                            metrics for all population members for that year
    :param entry: bool, True if we're getting data for entry cohorts, False if for exit cohorts
    :param units: a set of unique units of a certain type, e.g. towns
    :return: None
    """
    # the grand total and each unit are treated identically, so handle them in one pass
    breakdown_keys = ['grand_total']
    if units:
        breakdown_keys.extend(units)

    for year in cohorts_dict['grand_total']:
        for key in breakdown_keys:
            cohort_cell = cohorts_dict[key][year]
            cohort_size = cohort_cell['total_size']
            if cohort_size == 0:
                continue  # empty cohorts keep whatever value they already hold

            # for entry cohorts, compare with preceding year, unless there is no preceding year (first year)
            pop_by_year = population_dict[key]
            reference_year = year - 1 if entry and year - 1 in pop_by_year else year

            cohort_cell['chrt_prcnt_of_pop'] = helpers.percent(cohort_size,
                                                               pop_by_year[reference_year]['total_size'])
# Example #5
# 0
def inter_professional_transfers(multiprofs_py_table, out_dir, year_window):
    """
    Finds possible name matches between people who retired in year X from profession A, and people who joined
    professions B, C... within a window of years starting at X. In other words, if someone left a profession one
    year, see if in the following years they joined any of the other professions.

    Besides returning the counts, writes a csv log of every putative match to
    out_dir + 'interprofessional_transitions_<year_window>_year_window_match_list_log.csv', sorted by exit year.

    NB: need to choose carefully the start and end years since only for some years do we have overlap between
        different professions

    NB: this function assumes that each match will be human-checked afterwards. Consequently, it errs on the side
        of over-inclusion, i.e. prefers false positives.

    :param multiprofs_py_table: person-year table of all professions; the first and last rows are read to get the
                                first and last observation year, so it is presumably sorted by year -- confirm
                                against the caller
    :param out_dir: directory where the log of interprofessional transition matches will live; the file name is
                    appended directly to it, so it must end with a path separator
    :param year_window: int, how many years after exit we look for interprofessional transition;
                        if year_window = 0, we want only professional transfers in the exit year
                        if year_window = 3, we want only professional transfers in the exit year and two
                            consecutive years, e.g. 2000-2002 (the years 2000, 2001, and 2002)
                        etc.
    :return: dict of transfer measures, keyed by exit year, then sending profession, then receiving profession;
             the innermost dict holds 'total transfers', 'women transfers' and 'percent women transfers'
    """

    # load the gender dict, we'll need this later
    gender_dict = gender.get_gender_dict()

    # get start and end year of all observations, from the first and last rows of the table
    year_col_idx = helpers.get_header('all', 'combine').index('an')
    start_year = int(multiprofs_py_table[0][year_col_idx])
    end_year = int(multiprofs_py_table[-1][year_col_idx])

    # initialise a list/log of matches/putative cross-professional transfers, so we can eyeball for errors
    transfer_match_log = []

    # for each profession get the first and last observation years and the full names of yearly entry and exit cohorts
    professions_data = professions_yearspans_cohorts(multiprofs_py_table, combined=True)

    # make dict where level 1 key is year, level 2 key is sending profession (row), level 3 key is receiving
    # profession (column); level 4 dict holds counts: total count transfers from profession A to profession B
    # in year X, count women of those, percent women of those
    measures = {
        'total transfers': 0,
        'women transfers': 0,
        'percent women transfers': 0
    }
    # NB: the last observation year gets no entry (range excludes end_year)
    # NOTE(review): an exit cohort dated end_year would raise KeyError below -- presumably none exists; confirm
    transfers_dict = {
        year: {sender: {receiver: deepcopy(measures) for receiver in professions_data}
               for sender in professions_data}
        for year in range(start_year, end_year)
    }

    # for each profession
    for sending_profession in professions_data:

        # for each yearly exit cohort
        for exit_year, names in professions_data[sending_profession]['exit'].items():

            # get set of entrants to OTHER professions, from exit year to year + year_window; e.g. [2000-2002]
            other_profs_entrants = other_professions_entrants(sending_profession, professions_data,
                                                              exit_year, year_window)
            for exitee_name in names:

                # look for name match in set of entrants into other professions, in the specified time window
                for entrant in other_profs_entrants:
                    entrant_name, entry_year, entry_profession = entrant[0], entrant[1], entrant[2]

                    if not name_match(exitee_name, entrant_name):
                        continue

                    # add match to log for visual inspection; the empty string is a visual column separator
                    transfer_match_log.append([
                        exitee_name, exit_year, sending_profession, '',
                        entrant_name, entry_year, entry_profession
                    ])

                    # increment value of total counts in the transfer dict
                    pair_measures = transfers_dict[exit_year][sending_profession][entry_profession]
                    pair_measures['total transfers'] += 1

                    # check if exitee name is female, if yes increment appropriate count in transfer dict;
                    # given names are the second ' | '-separated field of the full name
                    exitee_given_names = exitee_name.split(' | ')[1]
                    if gender.get_gender(exitee_given_names, exitee_name, gender_dict) == 'f':
                        pair_measures['women transfers'] += 1

            # for that year get percent female transfers
            for prof in professions_data:
                pair_measures = transfers_dict[exit_year][sending_profession][prof]
                pair_measures['percent women transfers'] = helpers.percent(pair_measures['women transfers'],
                                                                           pair_measures['total transfers'])

    # write the match list log to disk for visual inspection
    log_out_path = out_dir + 'interprofessional_transitions_' + str(year_window) \
                   + '_year_window_match_list_log.csv'
    # newline='' lets the csv module control line endings itself (avoids doubled line breaks on Windows)
    with open(log_out_path, 'w', newline='') as out_p:
        writer = csv.writer(out_p)
        writer.writerow([
            "EXITEE NAME", "EXIT YEAR", "EXIT PROFESSION", "", "ENTRANT NAME",
            "ENTRY YEAR", "ENTRANT PROFESSION"
        ])
        writer.writerows(sorted(transfer_match_log, key=itemgetter(1)))  # sorted by exit year

    return transfers_dict
def prof_inherit_table(out_dir,
                       person_year_table,
                       profession,
                       year_window=1000,
                       num_top_names=0,
                       multi_name_robustness=False):
    """
    Puts the profession inheritance dict in a csv table, adding some percentages and sums. The first row of the
    output holds the upper-cased profession name, the second row is the header
    "YEAR", "MALE ENTRIES", "FEMALE ENTRIES", "TOTAL ENTRIES", "MALE INHERITANCE COUNT", "FEMALE INHERITANCE COUNT",
    "TOTAL INHERITANCE COUNT", "MALE INHERITANCE PERCENT", "FEMALE INHERITANCE PERCENT", "TOTAL INHERITANCE PERCENT",
    then comes one row per year, and finally a "GLOBAL" row with the sums and percentages over all years.

    The file is written to
    out_dir/<profession>[_MN_ROBUST]_exclude_surnames_above_rank_<num_top_names>_inheritance_table.csv

    :param out_dir: directory where the inheritance table will live
    :param person_year_table: a table of person years as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param year_window: int, how far back you want to look for inheritance; e.g. year_window == 4, we look four years
                        back, so if in 2004, we look back to 2000 (inclusive); Default is 1000, i.e. look at all years
    :param num_top_names: int, the number of top most frequent surnames that we consider the set of "common" surnames,
                          e.g. if num_top_names == 10, the ten surnames with the most associated people are considered
                          the "most common" surnames; Default is zero, i.e. no names are common
    :param multi_name_robustness: bool, True if we're running the multi-name robustness check
    :return: None
    """

    # get the inheritance dict
    inheritance_dict = profession_inheritance(
        out_dir,
        person_year_table,
        profession,
        year_window,
        num_top_names,
        multi_name_robustness=multi_name_robustness)

    # running totals over all years, for the final "GLOBAL" row
    sum_male_entries, sum_female_entries = 0, 0
    sum_male_inherit, sum_female_inherit = 0, 0

    # the robustness check gets its own file, distinguished only by a tag in the file name
    robustness_tag = '_MN_ROBUST' if multi_name_robustness else ''
    table_out_path = out_dir + '/' + profession + robustness_tag + '_exclude_surnames_above_rank_' \
                     + str(num_top_names) + '_inheritance_table.csv'

    # newline='' lets the csv module control line endings itself (avoids doubled line breaks on Windows)
    with open(table_out_path, 'w', newline='') as out_p:
        writer = csv.writer(out_p)
        writer.writerow([profession.upper()])
        writer.writerow([
            "YEAR", "MALE ENTRIES", "FEMALE ENTRIES", "TOTAL ENTRIES",
            "MALE INHERITANCE COUNT", "FEMALE INHERITANCE COUNT",
            "TOTAL INHERITANCE COUNT", "MALE INHERITANCE PERCENT",
            "FEMALE INHERITANCE PERCENT", "TOTAL INHERITANCE PERCENT"
        ])

        # for each year in the inheritance dict
        for year, counts in inheritance_dict.items():
            male_entries, female_entries = counts["male entrants"], counts["female entrants"]
            male_inherit, female_inherit = counts["male inherit"], counts["female inherit"]

            # increment counters
            sum_male_entries += male_entries
            sum_female_entries += female_entries
            sum_male_inherit += male_inherit
            sum_female_inherit += female_inherit

            # get sums and percentages
            total_entries = female_entries + male_entries
            total_inherit = female_inherit + male_inherit

            writer.writerow([
                year, male_entries, female_entries, total_entries,
                male_inherit, female_inherit, total_inherit,
                helpers.percent(male_inherit, male_entries),
                helpers.percent(female_inherit, female_entries),
                helpers.percent(total_inherit, total_entries)
            ])

        # summary row over all years
        sum_entries = sum_male_entries + sum_female_entries
        sum_inherit = sum_male_inherit + sum_female_inherit
        writer.writerow([
            "GLOBAL", sum_male_entries, sum_female_entries, sum_entries,
            sum_male_inherit, sum_female_inherit, sum_inherit,
            helpers.percent(sum_male_inherit, sum_male_entries),
            helpers.percent(sum_female_inherit, sum_female_entries),
            helpers.percent(sum_inherit, sum_entries)
        ])
def hierarchical_mobility(person_year_table, profession):
    """
    Counts, for each year and hierarchical level, how many people moved up, down, or across (i.e. between
    geographic units in the same level), disaggregating mobility by gender. The levels are
    {1: low court, 2: tribunal, 3: appellate court, 4: high court}.  The output dict has the following format:

    {year: {
        1: {
            "across": {"m": int, "f": int, "dk": int, "total": int, "percent female": value of helpers.percent},
            "down": {...},
            "up": {...}
            },
        2: {...},
        3: {...},
        4: {...}
        },
     next year: {...},
     ...
    }

    Year keys are the year values exactly as they appear in the table; level keys are the ints 1 to 4.

    NB: "m" = male, "f" = "female", "dk" = "don't know".

    NB: a move is booked under the year and level the person is LEAVING: by convention there is mobility in a
        given year if the person's next person-year is at a different level or unit.

    NB: there is no "down" for low courts, or "up" and "across" for the high court.

    NB: data on retirements ("out") come via exit cohorts from the function "pop_cohort_counts".

    NB: only judges and prosecutors have a hierarchical system -- this function is not sensical for notaries,
        executori, and lawyers.

    NB: person_year_table is sorted in place, by person ID and year.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :return: a dict of mobility info
    """

    # column positions, all read off one header lookup
    header = helpers.get_header(profession, 'preprocess')
    pid_col_idx = header.index('cod persoană')
    gender_col_idx = header.index('sex')
    year_col_idx = header.index('an')
    level_col_idx = header.index('nivel')
    jud_col_idx = header.index('jud cod')
    trib_col_idx = header.index('trib cod')
    ca_col_idx = header.index('ca cod')

    def unit_code(row):
        # each unit is uniquely identified by its three-level hierarchical code
        return '|'.join([row[jud_col_idx], row[trib_col_idx], row[ca_col_idx]])

    def blank_counts():
        return {"m": 0, "f": 0, "dk": 0, "total": 0, "percent female": 0}

    # one zeroed cell per (year, level, mobility type)
    years = sorted({row[year_col_idx] for row in person_year_table})
    mobility_types = ["across", "down", "up"]
    mob_dict = {}
    for year in years:
        mob_dict[year] = {lvl: {kind: blank_counts() for kind in mobility_types} for lvl in range(1, 5)}

    # order rows by person and year, then split the table into one career (list of rows) per person
    person_year_table.sort(key=itemgetter(pid_col_idx, year_col_idx))
    careers = [list(rows) for _, rows in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # walk each career as consecutive (this year, next year) pairs
    for career in careers:
        sex = career[0][gender_col_idx]
        for this_py, next_py in zip(career, career[1:]):
            year = this_py[year_col_idx]
            level_now = int(this_py[level_col_idx])
            level_next = int(next_py[level_col_idx])

            if level_now < level_next:
                direction = "up"
            elif level_now > level_next:
                direction = "down"
            elif unit_code(this_py) != unit_code(next_py):
                # same level but a different unit: a lateral move
                direction = "across"
            else:
                continue  # stayed put, nothing to count

            mob_dict[year][level_now][direction][sex] += 1

    # derive the aggregate values from the gender counts
    for levels in mob_dict.values():
        for mobility in levels.values():
            for cell in mobility.values():
                cell["total"] = cell["m"] + cell["f"] + cell["dk"]
                cell["percent female"] = helpers.percent(cell["f"], cell["total"])

    return mob_dict
def career_climbings(person_year_table, profession, use_cohorts, first_x_years):
    """
    Return a dict of metrics on career climbing, i.e. of moving up the judicial hierarchy.

    NB: these metrics are only for a subset of observations, namely those specified by use_cohorts. The purpose of
        this feature is to help us avoid years with rotten data, while giving us a big enough time interval to catch
        movement up two levels

    We want two pieces of information:

    a) total counts and % female of those who stay in low courts, climb to tribunals, and climb to appellate courts
    b) average time it took to climb, whether to tribunal or appellate court, for those cohort members who climbed
        to those levels

    NB: person_year_table is sorted in place, by person ID and year.

    :param person_year_table: a table of person-years, as a list of lists
    :param profession: string, "judges", "prosecutors", "notaries" or "executori".
    :param use_cohorts: list of ints, each int represents a year for which you analyse entry cohorts,
                        e.g. [2006, 2007]
    :param first_x_years: int, the number of years from start of career that we consider, e.g. ten years since entry
    :return: dict keyed by 'low court', 'tribunal', 'appellate', 'high court'; each value is a dict with
             'career type table' (the careers of that type) and 'counts dict' (the counts 'm', 'f', 'dk', 'total',
             plus 'percent female' and 'avrg yrs to promotion', the latter 'NA' when no promotion time was kept)
    """

    # column positions, all read off one header lookup
    header = helpers.get_header(profession, 'preprocess')
    pid_col_idx = header.index('cod persoană')
    year_col_idx = header.index('an')
    gender_col_idx = header.index('sex')

    # order rows by person and year, then split the table into one career (list of rows) per person
    person_year_table.sort(key=itemgetter(pid_col_idx, year_col_idx))
    careers = [list(rows) for _, rows in itertools.groupby(person_year_table, key=itemgetter(pid_col_idx))]

    # one bucket per highest level reached, each with its own copy of the zeroed counters
    counts_dict = {'m': 0, 'f': 0, 'dk': 0, 'total': 0, 'percent female': 0, 'avrg yrs to promotion': 0}
    levels = ['low court', 'tribunal', 'appellate', 'high court']
    careers_by_levels = {}
    for lvl in levels:
        careers_by_levels[lvl] = {'career type table': [], 'counts dict': deepcopy(counts_dict)}
    # the return value is not used, so this call is expected to fill the buckets in place
    fill_careers_by_levels_dict(careers, profession, use_cohorts, careers_by_levels)

    # basic descriptives for each career type
    for step, info in careers_by_levels.items():
        tally = info['counts dict']
        promotion_times = []

        for career in info['career type table']:
            sex = career[0][gender_col_idx]

            # time taken to climb the hierarchy; only the first X years of the career are passed on for
            # consideration, to make careers of different total length comparable
            wait = time_to_promotion(career, profession, step, first_x_years)

            if wait != 'NA':
                # skip people who jumped seniority requirements (e.g. came from a different legal profession)
                # or who show more than ten years, which the original comments treat as an error
                if not (min_time_promotion(step) <= wait < 11):
                    continue
                promotion_times.append(wait)

            # 'NA' (low court people) and every kept promotion are both counted
            tally[sex] += 1

        tally['total'] = tally['f'] + tally['m'] + tally['dk']
        tally['percent female'] = helpers.percent(tally['f'], tally['total'])
        if 'NA' in promotion_times or promotion_times == []:
            tally['avrg yrs to promotion'] = 'NA'
        else:
            tally['avrg yrs to promotion'] = round(statistics.mean(promotion_times))

    return careers_by_levels